From fe90ac2b1ae3fce0ba46c54d4518849b52ac88e2 Mon Sep 17 00:00:00 2001 From: vivian chen Date: Tue, 12 Sep 2023 16:20:08 +0000 Subject: [PATCH 01/69] support falcon --- .../convert_hf_falcon_to_nemo.py | 355 ++++++++++++++++++ 1 file changed, 355 insertions(+) create mode 100644 scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py new file mode 100644 index 000000000000..20a94f8f5c54 --- /dev/null +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -0,0 +1,355 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Conversion script to convert Huggingface Falcon 1B/7B/40B/180B checkpoints into nemo checkpoint. + +This script will generate a Megatron model with TP=1 and PP=1. If you need different TP/PP +values, then after running this script, please use the script located below to set the +TP/PP values you want: + NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py + +Example to run this conversion script: +``` + python convert_hf_falcon_to_nemo.py \ + --in-file \ + --out-file \ +``` +""" + +import os +import logging +import time +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.core.saving import _load_state as ptl_load_state +from pytorch_lightning.trainer.trainer import Trainer +from transformers import AutoTokenizer, AutoModelForCausalLM, FalconConfig + + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) + + +# TODO: +# [Y] refactor ckpt func to make it cleaner +# [Y] dict tokenizer mapping for falcon family +# [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml +# [ ] safetensors loading. (only 180b used safetensors) +# [ ] test on non parallel attention model (block by no alibi support?) +# [Y] hf config name mapping for falcon 7b and 40b. +# [Y] trust remote code add +# [ ] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) +# [ ] When bias_gelu_fusion is True, add_bias_linear must also be True. error +# [ ] remove unnecessary comments and codes. 
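+# Key-mapping overview (illustrative summary of the conversions performed in convert() below;
+# {l} is the decoder layer index):
+#   transformer.word_embeddings.weight                -> model.embedding.word_embeddings.weight
+#   transformer.h.{l}.self_attention.query_key_value  -> model.decoder.layers.{l}.self_attention.linear_qkv
+#   transformer.h.{l}.self_attention.dense            -> model.decoder.layers.{l}.self_attention.linear_proj
+#   transformer.h.{l}.mlp.dense_h_to_4h               -> model.decoder.layers.{l}.mlp.linear_fc1
+#   transformer.h.{l}.mlp.dense_4h_to_h               -> model.decoder.layers.{l}.mlp.linear_fc2
+#   transformer.ln_f                                  -> model.decoder.final_layernorm
+#   lm_head.weight                                    -> model.output_layer.weight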
+ +def setup_logging(log_file="test_log.txt"): + logging.basicConfig(filename=log_file, level=logging.INFO, + format='%(asctime)s [%(levelname)s] - %(message)s', datefmt='%d-%b-%y %H:%M:%S') +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--in-file", type=str, default=None, required=True, help="Path to Huggingface Falcon checkpoints", + ) + parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument("--tokenizer-type", type=str, default="tiiuae/falcon-7b", help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'." + ) + args = parser.parse_args() + return args + + +def load_model(cls, checkpoint, strict, **kwargs): + try: + if 'cfg' in kwargs: + model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) + else: + model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) + for name, module in model.named_parameters(): + if name in checkpoint['state_dict']: + module.data = checkpoint['state_dict'][name] + checkpoint['state_dict'].pop(name) + else: + print(f"Unexpected key: {name} not in checkpoint but in model.") + if len(checkpoint['state_dict'].keys()) != 0: + raise RuntimeError( + f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." + ) + + # register the artifacts + cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] + if cfg.tokenizer.model is not None: + model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) + if cfg.tokenizer.vocab_file is not None: + model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) + if cfg.tokenizer.merge_file is not None: + model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) + finally: + cls._set_model_restore_state(is_being_restored=False) + return model + + +def load_falcon_config(args) -> FalconConfig: + """ Helper utility to load FalconConfig. + + 7B and 40B are not compatible with `transformers.FalconConfig` and + `transformers.FalconModel`. need to manually set the config values + and force to `falcon` model type. 
+ """ + config = FalconConfig.from_pretrained(args.in_file) + + if config.model_type == 'RefinedWeb': + mappings = { + "num_hidden_layers": config.n_layer, + "num_attention_heads": config.n_head, + "num_kv_heads": config.n_head_kv, + "new_decoder_architecture": True + } + elif config.model_type == 'RefinedWebModel': + mappings = { + "num_hidden_layers": config.n_layer, + "num_attention_heads": config.n_head, + "num_kv_heads": 1 if config.multi_query else config.n_head, + "new_decoder_architecture": False + } + else: + return config + + for key, value in mappings.items(): + setattr(config, key, value) + + config.model_type = 'falcon' + return config + + +def load_config(args): + falcon_config = load_falcon_config(args) + logging.info(f"falcon_config, {falcon_config}") + nemo_config = OmegaConf.load( + os.path.join(os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gpt_config.yaml') + ).model + nemo_config.encoder_seq_length = falcon_config.max_position_embeddings + nemo_config.num_layers = int(falcon_config.num_hidden_layers) + nemo_config.hidden_size = falcon_config.hidden_size + nemo_config.num_attention_heads = falcon_config.num_attention_heads + nemo_config.max_position_embeddings = falcon_config.max_position_embeddings + nemo_config.init_method_std = falcon_config.initializer_range + nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon + if falcon_config.alibi: + raise ValueError("Alibi is not yet supported in Megatron Core") + else: + nemo_config.position_embedding_type = 'rope' + nemo_config.bias = falcon_config.bias + nemo_config.hidden_dropout = falcon_config.hidden_dropout + nemo_config.attention_dropout = falcon_config.attention_dropout + # need to map to nemo config as well. + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? 
+ } + + nemo_config.tokenizer = tokenizer_dict + ############################################## + # need refactor Mcore to support parallel attn + #nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn + #nemo_config.parallel_attention = falcon_config['parallel_attn'] + ############################################### + #if hasattr(falcon_config,'num_kv_heads'): + if falcon_config.new_decoder_architecture or falcon_config.multi_query: + nemo_config.num_query_groups = falcon_config.num_kv_heads + nemo_config.use_cpu_initialization = True + nemo_config.activation = 'gelu' + if falcon_config.rope_scaling is not None: + if falcon_config.rope_scaling.type == 'linear': + nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor + else: + raise ValueError("Only linear rope scaling type is supported now") + + base = 128 + while falcon_config.vocab_size % base != 0: + base //= 2 + nemo_config.make_vocab_size_divisible_by = base + + return nemo_config + + +def convert(args): + logging.info(f"loading checkpoint {args.in_file}") + tik = time.time() + model = AutoModelForCausalLM.from_pretrained(args.in_file, trust_remote_code=True) + #tokenizer = AutoTokenizer.from_pretrained(args.in_file) + hf_config = load_falcon_config(args) + + # print(f"hf_config: {hf_config}") + # print("named parameters:") + # for name, param in model.named_parameters(): + # print(f"- {name}") + + # add debug state dict list + hf_keys = list(model.state_dict().keys()) + + nemo_config = load_config(args) + + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + elif args.precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + precision = args.precision + else: + logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") + precision = args.precision[2:] # prune bf in string + else: + precision = args.precision + + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + growth_interval=nemo_config.get('native_amp_growth_interval', 1000), + hysteresis=nemo_config.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if nemo_config.get('megatron_amp_O2', False): + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + + if precision == 32: + dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + dtype = torch.float16 + elif precision == ["bf16", "bf16-mixed"]: + dtype = torch.bfloat16 + else: + dtype = torch.float32 # fallback + + nemo_config.precision = precision + + trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision) + + hidden_size = hf_config.hidden_size + head_num = hf_config.num_attention_heads + head_size = hidden_size // head_num + num_layers = hf_config.num_hidden_layers + + nemo_config.mcore_gpt = True + nemo_config.transformer_engine = True + logging.info(f"nemo_config {nemo_config}") + logging.info(f"mcore_gpt: {nemo_config.mcore_gpt}") + logging.info(f"transformer_engine: {nemo_config.transformer_engine}") + # assert nemo_config.mcore_gpt == nemo_config.get( + # 'transformer_engine', False + # ), "mcore_gpt transformer_engine must be enabled (or disabled) together." + + param_to_weights = lambda param: param.float() + + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): + source_name = f"{source_prefix}.{weight_or_bias}" + if source_name in model.state_dict(): + target_name = f"{target_prefix}.{weight_or_bias}" + checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) + + # add debug remove mapped keys + if source_name in hf_keys: + hf_keys.remove(source_name) + + def add_weight_and_possible_bias(source_prefix, target_prefix): + add_to_checkpoint(source_prefix, target_prefix, 'weight') + if f"{source_prefix}.bias" in model.state_dict(): + add_to_checkpoint(source_prefix, target_prefix, 'bias') + + add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') + + if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: + num_query_groups = head_num + else: + num_query_groups = nemo_config.num_query_groups + assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' + + for l in range(int(num_layers)): + print(f"converting layer {l}") + prefix = f'transformer.h.{l}' + + + # HF: [num_heads x 3 x head_dim, hidden_size], interleaved qkv weights + # Query types and expected kv heads. 
+ # - MHA: num_heads = num_kv_heads + # - Multi-Query Attention: num_kv_heads = 1 + # - Grouped-Query Attention: num_heads % num_kv_heads = 0 + + add_weight_and_possible_bias(f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv') + add_weight_and_possible_bias(f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj') + add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') + add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') + + if hf_config.new_decoder_architecture: + add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + else: + add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + if not hf_config.parallel_attn: + add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm') + + print(f"done layer {l}") + + # final layer norm + add_weight_and_possible_bias('transformer.ln_f', 'model.decoder.final_layernorm') + + # LM weight + add_to_checkpoint('lm_head', 'model.output_layer','weight') + + if hf_keys: + logging.warning(f"Some keys in HuggingFace's model didn't get mapped to NeMo's state_dict: {hf_keys}") + + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logging.info(f'Weights loaded. Total time: {t}') + + del model + + model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + + model._save_restore_connector = NLPSaveRestoreConnector() + + # cast to target precision and disable cpu init + model = model.to(dtype=dtype) + model.cfg.use_cpu_initialization = False + + model.save_to(args.out_file) + logging.info(f'NeMo model saved to: {args.out_file}') + + +if __name__ == '__main__': + setup_logging() + args = get_args() + convert(args) \ No newline at end of file From ffaf2289aedca120a794c6648674ad4c38206d9c Mon Sep 17 00:00:00 2001 From: vivian chen Date: Tue, 12 Sep 2023 18:01:25 +0000 Subject: [PATCH 02/69] support falcon bug fix layernorm naming --- .../convert_hf_falcon_to_nemo.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 20a94f8f5c54..b7b3ed55e3b7 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -89,7 +89,7 @@ def load_model(cls, checkpoint, strict, **kwargs): module.data = checkpoint['state_dict'][name] checkpoint['state_dict'].pop(name) else: - print(f"Unexpected key: {name} not in checkpoint but in model.") + logging.info(f"Unexpected key: {name} not in checkpoint but in model.") if len(checkpoint['state_dict'].keys()) != 0: raise RuntimeError( f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." 
@@ -259,6 +259,8 @@ def convert(args): nemo_config.mcore_gpt = True nemo_config.transformer_engine = True + nemo_config.bias_activation_fusion = False + logging.info(f"nemo_config {nemo_config}") logging.info(f"mcore_gpt: {nemo_config.mcore_gpt}") logging.info(f"transformer_engine: {nemo_config.transformer_engine}") @@ -271,20 +273,23 @@ def convert(args): checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() - def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm=False): source_name = f"{source_prefix}.{weight_or_bias}" if source_name in model.state_dict(): - target_name = f"{target_prefix}.{weight_or_bias}" + if is_layernorm: + target_name = f"{target_prefix}_{weight_or_bias}" + else: + target_name = f"{target_prefix}.{weight_or_bias}" checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) # add debug remove mapped keys if source_name in hf_keys: hf_keys.remove(source_name) - def add_weight_and_possible_bias(source_prefix, target_prefix): - add_to_checkpoint(source_prefix, target_prefix, 'weight') + def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=False): + add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) if f"{source_prefix}.bias" in model.state_dict(): - add_to_checkpoint(source_prefix, target_prefix, 'bias') + add_to_checkpoint(source_prefix, target_prefix, 'bias', is_layernorm) add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') @@ -311,12 +316,12 @@ def add_weight_and_possible_bias(source_prefix, target_prefix): add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if hf_config.new_decoder_architecture: - add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) else: - add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) if not hf_config.parallel_attn: - add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm') + add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) print(f"done layer {l}") From 562e6f08a9129fdd88b5411b67c3811e86af4ec3 Mon Sep 17 00:00:00 2001 From: vivian chen Date: Wed, 13 Sep 2023 00:37:36 +0000 Subject: [PATCH 03/69] fix todo --- .../convert_hf_falcon_to_nemo.py | 186 +++++++----------- 1 file changed, 76 insertions(+), 110 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index b7b3ed55e3b7..99e7866e43a2 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -25,6 +25,8 @@ python 
convert_hf_falcon_to_nemo.py \ --in-file \ --out-file \ + --tokenizer-type \ + --precision ``` """ @@ -55,15 +57,15 @@ # [Y] dict tokenizer mapping for falcon family # [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml # [ ] safetensors loading. (only 180b used safetensors) -# [ ] test on non parallel attention model (block by no alibi support?) +# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) # [Y] hf config name mapping for falcon 7b and 40b. # [Y] trust remote code add -# [ ] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) -# [ ] When bias_gelu_fusion is True, add_bias_linear must also be True. error +# [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) +# [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error # [ ] remove unnecessary comments and codes. def setup_logging(log_file="test_log.txt"): - logging.basicConfig(filename=log_file, level=logging.INFO, + logging.basicConfig(filename=log_file, level=logging.DEBUG, format='%(asctime)s [%(levelname)s] - %(message)s', datefmt='%d-%b-%y %H:%M:%S') def get_args(): parser = ArgumentParser() @@ -94,15 +96,6 @@ def load_model(cls, checkpoint, strict, **kwargs): raise RuntimeError( f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." ) - - # register the artifacts - cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] - if cfg.tokenizer.model is not None: - model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) - if cfg.tokenizer.vocab_file is not None: - model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) - if cfg.tokenizer.merge_file is not None: - model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) finally: cls._set_model_restore_state(is_being_restored=False) return model @@ -111,7 +104,7 @@ def load_model(cls, checkpoint, strict, **kwargs): def load_falcon_config(args) -> FalconConfig: """ Helper utility to load FalconConfig. - 7B and 40B are not compatible with `transformers.FalconConfig` and + Falcon-7B and Falcon-40B are not compatible with `transformers.FalconConfig` and `transformers.FalconModel`. need to manually set the config values and force to `falcon` model type. """ @@ -141,7 +134,7 @@ def load_falcon_config(args) -> FalconConfig: return config -def load_config(args): +def load_nemo_config(args): falcon_config = load_falcon_config(args) logging.info(f"falcon_config, {falcon_config}") nemo_config = OmegaConf.load( @@ -154,14 +147,18 @@ def load_config(args): nemo_config.max_position_embeddings = falcon_config.max_position_embeddings nemo_config.init_method_std = falcon_config.initializer_range nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon - if falcon_config.alibi: - raise ValueError("Alibi is not yet supported in Megatron Core") - else: + try: + if falcon_config.alibi: + raise ValueError("Alibi is not yet supported in Megatron Core, \ + force to use RoPE will generate suboptimal responses") + except ValueError as e: + print(e) + finally: nemo_config.position_embedding_type = 'rope' nemo_config.bias = falcon_config.bias nemo_config.hidden_dropout = falcon_config.hidden_dropout nemo_config.attention_dropout = falcon_config.attention_dropout - # need to map to nemo config as well. + # TODO: how does vocab_file, merge_file etc get mapped automatically in respect to variants of falcon models? 
tokenizer_dict = { 'library': 'huggingface', 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? @@ -169,13 +166,12 @@ def load_config(args): nemo_config.tokenizer = tokenizer_dict ############################################## - # need refactor Mcore to support parallel attn + # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch #nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn #nemo_config.parallel_attention = falcon_config['parallel_attn'] ############################################### - #if hasattr(falcon_config,'num_kv_heads'): - if falcon_config.new_decoder_architecture or falcon_config.multi_query: - nemo_config.num_query_groups = falcon_config.num_kv_heads + + nemo_config.num_query_groups = falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None nemo_config.use_cpu_initialization = True nemo_config.activation = 'gelu' if falcon_config.rope_scaling is not None: @@ -183,7 +179,11 @@ def load_config(args): nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - + + nemo_config.mcore_gpt = True + nemo_config.transformer_engine = True + nemo_config.bias_activation_fusion = False + base = 128 while falcon_config.vocab_size % base != 0: base //= 2 @@ -191,82 +191,64 @@ def load_config(args): return nemo_config +def determine_precision(args): + """Helper function to determine the precision of model + """ + if args.precision in ["32", "16"]: + return int(args.precision) + elif args.precision in ["bf16", "bf16-mixed"]: + if not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()): + logging.warning("BF16 is not supported on this device. Using FP16 instead.") + return args.precision[2:] # prune 'bf' from string + return args.precision + +def determine_dtype(precision): + dtype_map = { + "32": torch.float32, + "16": torch.float16, + "16-mixed": torch.float16, + "bf16": torch.bfloat16, + "bf16-mixed": torch.bfloat16 + } + return dtype_map.get(precision, torch.float32) # default to torch.float32 def convert(args): logging.info(f"loading checkpoint {args.in_file}") tik = time.time() model = AutoModelForCausalLM.from_pretrained(args.in_file, trust_remote_code=True) - #tokenizer = AutoTokenizer.from_pretrained(args.in_file) - hf_config = load_falcon_config(args) - - # print(f"hf_config: {hf_config}") - # print("named parameters:") - # for name, param in model.named_parameters(): - # print(f"- {name}") - - # add debug state dict list - hf_keys = list(model.state_dict().keys()) + falcon_config = load_falcon_config(args) + # debug + logging.debug(f"initial falcon_config, {falcon_config}") - nemo_config = load_config(args) - - if args.precision in ["32", "16"]: - precision = int(float(args.precision)) - elif args.precision in ["bf16", "bf16-mixed"]: - if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): - precision = args.precision - else: - logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") - precision = args.precision[2:] # prune bf in string - else: - precision = args.precision + nemo_config = load_nemo_config(args) + precision = determine_precision(args) plugins = [] - if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), - growth_interval=nemo_config.get('native_amp_growth_interval', 1000), - hysteresis=nemo_config.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if nemo_config.get('megatron_amp_O2', False): - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if precision == 32: - dtype = torch.float32 - elif precision in [16, "16", "16-mixed"]: - dtype = torch.float16 - elif precision == ["bf16", "bf16-mixed"]: - dtype = torch.bfloat16 - else: - dtype = torch.float32 # fallback + + if precision in ['16', '16-mixed', 'bf16', 'bf16-mixed']: + scaler_params = { + 'init_scale': nemo_config.get('native_amp_init_scale', 2 ** 32), + 'growth_interval': nemo_config.get('native_amp_growth_interval', 1000), + 'hysteresis': nemo_config.get('hysteresis', 2) + } + + plugin_precision = '16-mixed' if precision in ['16', '16-mixed'] else 'bf16-mixed' + scaler = GradScaler(**scaler_params) if precision in ['16', '16-mixed'] else None + dtype = determine_dtype(precision) nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision) - hidden_size = hf_config.hidden_size - head_num = hf_config.num_attention_heads + hidden_size = falcon_config.hidden_size + head_num = falcon_config.num_attention_heads head_size = hidden_size // head_num - num_layers = hf_config.num_hidden_layers + num_layers = falcon_config.num_hidden_layers - nemo_config.mcore_gpt = True - nemo_config.transformer_engine = True - nemo_config.bias_activation_fusion = False - - logging.info(f"nemo_config {nemo_config}") - logging.info(f"mcore_gpt: {nemo_config.mcore_gpt}") - logging.info(f"transformer_engine: {nemo_config.transformer_engine}") - # assert nemo_config.mcore_gpt == nemo_config.get( - # 'transformer_engine', False - # ), "mcore_gpt transformer_engine must be enabled (or disabled) together." 
+ # - MHA: num_heads = num_kv_heads + # - Multi-Query Attention: num_kv_heads = 1 + # - Grouped-Query Attention: num_heads % num_kv_heads = 0 + num_query_groups = nemo_config.num_query_groups if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num else head_num + assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' param_to_weights = lambda param: param.float() @@ -281,10 +263,6 @@ def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm else: target_name = f"{target_prefix}.{weight_or_bias}" checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) - - # add debug remove mapped keys - if source_name in hf_keys: - hf_keys.remove(source_name) def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=False): add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) @@ -293,34 +271,21 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') - if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: - num_query_groups = head_num - else: - num_query_groups = nemo_config.num_query_groups - assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' - for l in range(int(num_layers)): print(f"converting layer {l}") prefix = f'transformer.h.{l}' - - # HF: [num_heads x 3 x head_dim, hidden_size], interleaved qkv weights - # Query types and expected kv heads. - # - MHA: num_heads = num_kv_heads - # - Multi-Query Attention: num_kv_heads = 1 - # - Grouped-Query Attention: num_heads % num_kv_heads = 0 - add_weight_and_possible_bias(f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv') add_weight_and_possible_bias(f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj') add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') - if hf_config.new_decoder_architecture: + if falcon_config.new_decoder_architecture: add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) else: add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - if not hf_config.parallel_attn: + if not falcon_config.parallel_attn: add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) print(f"done layer {l}") @@ -331,10 +296,8 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals # LM weight add_to_checkpoint('lm_head', 'model.output_layer','weight') - if hf_keys: - logging.warning(f"Some keys in HuggingFace's model didn't get mapped to NeMo's state_dict: {hf_keys}") - checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + logging.debug(f'final checkpoint, {checkpoint}') tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) @@ -342,14 +305,17 @@ def 
add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model - model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() # cast to target precision and disable cpu init model = model.to(dtype=dtype) model.cfg.use_cpu_initialization = False - + # We make sure that the tokenizer can be instantiated later regardless of args.input + model.cfg.tokenizer.update(type=args.tokenizer_type) + # save model model.save_to(args.out_file) logging.info(f'NeMo model saved to: {args.out_file}') From 8297b5ccb9b4c489e25316777cf8181f24712ebe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Sep 2023 17:55:43 +0000 Subject: [PATCH 04/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../convert_hf_falcon_to_nemo.py | 124 ++++++++++++------ 1 file changed, 82 insertions(+), 42 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 99e7866e43a2..2fcaae6a0f75 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -30,8 +30,8 @@ ``` """ -import os import logging +import os import time from argparse import ArgumentParser from collections import OrderedDict @@ -40,8 +40,7 @@ from omegaconf import OmegaConf from pytorch_lightning.core.saving import _load_state as ptl_load_state from pytorch_lightning.trainer.trainer import Trainer -from transformers import AutoTokenizer, AutoModelForCausalLM, FalconConfig - +from transformers import AutoModelForCausalLM, AutoTokenizer, FalconConfig from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import ( @@ -51,22 +50,28 @@ PipelineMixedPrecisionPlugin, ) - # TODO: # [Y] refactor ckpt func to make it cleaner # [Y] dict tokenizer mapping for falcon family # [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml # [ ] safetensors loading. (only 180b used safetensors) -# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) +# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) # [Y] hf config name mapping for falcon 7b and 40b. # [Y] trust remote code add # [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) # [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error # [ ] remove unnecessary comments and codes. 
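+# Query-group convention assumed when setting num_query_groups
+# (a sketch that mirrors the MHA/MQA/GQA comments in convert()):
+#   MHA: num_kv_heads == num_attention_heads -> num_query_groups = head_num
+#   MQA: num_kv_heads == 1                   -> num_query_groups = 1
+#   GQA: head_num % num_kv_heads == 0        -> num_query_groups = num_kv_heads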
+ def setup_logging(log_file="test_log.txt"): - logging.basicConfig(filename=log_file, level=logging.DEBUG, - format='%(asctime)s [%(levelname)s] - %(message)s', datefmt='%d-%b-%y %H:%M:%S') + logging.basicConfig( + filename=log_file, + level=logging.DEBUG, + format='%(asctime)s [%(levelname)s] - %(message)s', + datefmt='%d-%b-%y %H:%M:%S', + ) + + def get_args(): parser = ArgumentParser() parser.add_argument( @@ -74,7 +79,11 @@ def get_args(): ) parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument("--precision", type=str, default="32", help="Model precision") - parser.add_argument("--tokenizer-type", type=str, default="tiiuae/falcon-7b", help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'." + parser.add_argument( + "--tokenizer-type", + type=str, + default="tiiuae/falcon-7b", + help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'.", ) args = parser.parse_args() return args @@ -115,14 +124,14 @@ def load_falcon_config(args) -> FalconConfig: "num_hidden_layers": config.n_layer, "num_attention_heads": config.n_head, "num_kv_heads": config.n_head_kv, - "new_decoder_architecture": True + "new_decoder_architecture": True, } elif config.model_type == 'RefinedWebModel': mappings = { "num_hidden_layers": config.n_layer, "num_attention_heads": config.n_head, - "num_kv_heads": 1 if config.multi_query else config.n_head, - "new_decoder_architecture": False + "num_kv_heads": 1 if config.multi_query else config.n_head, + "new_decoder_architecture": False, } else: return config @@ -147,10 +156,12 @@ def load_nemo_config(args): nemo_config.max_position_embeddings = falcon_config.max_position_embeddings nemo_config.init_method_std = falcon_config.initializer_range nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon - try: + try: if falcon_config.alibi: - raise ValueError("Alibi is not yet supported in Megatron Core, \ - force to use RoPE will generate suboptimal responses") + raise ValueError( + "Alibi is not yet supported in Megatron Core, \ + force to use RoPE will generate suboptimal responses" + ) except ValueError as e: print(e) finally: @@ -163,15 +174,17 @@ def load_nemo_config(args): 'library': 'huggingface', 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? 
} - + nemo_config.tokenizer = tokenizer_dict ############################################## # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch - #nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn - #nemo_config.parallel_attention = falcon_config['parallel_attn'] + # nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn + # nemo_config.parallel_attention = falcon_config['parallel_attn'] ############################################### - nemo_config.num_query_groups = falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None + nemo_config.num_query_groups = ( + falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None + ) nemo_config.use_cpu_initialization = True nemo_config.activation = 'gelu' if falcon_config.rope_scaling is not None: @@ -179,11 +192,11 @@ def load_nemo_config(args): nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - + nemo_config.mcore_gpt = True nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False - + base = 128 while falcon_config.vocab_size % base != 0: base //= 2 @@ -191,6 +204,7 @@ def load_nemo_config(args): return nemo_config + def determine_precision(args): """Helper function to determine the precision of model """ @@ -202,16 +216,18 @@ def determine_precision(args): return args.precision[2:] # prune 'bf' from string return args.precision + def determine_dtype(precision): dtype_map = { "32": torch.float32, "16": torch.float16, "16-mixed": torch.float16, "bf16": torch.bfloat16, - "bf16-mixed": torch.bfloat16 + "bf16-mixed": torch.bfloat16, } return dtype_map.get(precision, torch.float32) # default to torch.float32 + def convert(args): logging.info(f"loading checkpoint {args.in_file}") tik = time.time() @@ -224,14 +240,14 @@ def convert(args): precision = determine_precision(args) plugins = [] - + if precision in ['16', '16-mixed', 'bf16', 'bf16-mixed']: scaler_params = { 'init_scale': nemo_config.get('native_amp_init_scale', 2 ** 32), 'growth_interval': nemo_config.get('native_amp_growth_interval', 1000), - 'hysteresis': nemo_config.get('hysteresis', 2) + 'hysteresis': nemo_config.get('hysteresis', 2), } - + plugin_precision = '16-mixed' if precision in ['16', '16-mixed'] else 'bf16-mixed' scaler = GradScaler(**scaler_params) if precision in ['16', '16-mixed'] else None @@ -241,20 +257,26 @@ def convert(args): hidden_size = falcon_config.hidden_size head_num = falcon_config.num_attention_heads - head_size = hidden_size // head_num + head_size = hidden_size // head_num num_layers = falcon_config.num_hidden_layers - + # - MHA: num_heads = num_kv_heads # - Multi-Query Attention: num_kv_heads = 1 # - Grouped-Query Attention: num_heads % num_kv_heads = 0 - num_query_groups = nemo_config.num_query_groups if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num else head_num - assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' + num_query_groups = ( + nemo_config.num_query_groups + if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num + else head_num + ) + assert ( + head_num % num_query_groups == 0 + ), f'head_num ({head_num}) must be divisible by num_query_groups 
({num_query_groups})' param_to_weights = lambda param: param.float() checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() - + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm=False): source_name = f"{source_prefix}.{weight_or_bias}" if source_name in model.state_dict(): @@ -268,25 +290,43 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) if f"{source_prefix}.bias" in model.state_dict(): add_to_checkpoint(source_prefix, target_prefix, 'bias', is_layernorm) - + add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') for l in range(int(num_layers)): print(f"converting layer {l}") prefix = f'transformer.h.{l}' - - add_weight_and_possible_bias(f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv') - add_weight_and_possible_bias(f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj') + + add_weight_and_possible_bias( + f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv' + ) + add_weight_and_possible_bias( + f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj' + ) add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if falcon_config.new_decoder_architecture: - add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.ln_attn', + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', + is_layernorm=True, + ) + add_weight_and_possible_bias( + f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True + ) else: - add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.input_layernorm', + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', + is_layernorm=True, + ) if not falcon_config.parallel_attn: - add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.post_attention_layernorm', + f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', + is_layernorm=True, + ) print(f"done layer {l}") @@ -294,18 +334,18 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_weight_and_possible_bias('transformer.ln_f', 'model.decoder.final_layernorm') # LM weight - add_to_checkpoint('lm_head', 'model.output_layer','weight') - + add_to_checkpoint('lm_head', 'model.output_layer', 'weight') + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config logging.debug(f'final checkpoint, {checkpoint}') - + tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) logging.info(f'Weights loaded. 
Total time: {t}') del model - #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() @@ -323,4 +363,4 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if __name__ == '__main__': setup_logging() args = get_args() - convert(args) \ No newline at end of file + convert(args) From 2fc07a40ae19627637248ff03c8d48bd31044f77 Mon Sep 17 00:00:00 2001 From: vivian Date: Tue, 19 Sep 2023 20:05:42 +0000 Subject: [PATCH 05/69] fix for new architecture --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 99e7866e43a2..896310ecd3bd 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -183,6 +183,7 @@ def load_nemo_config(args): nemo_config.mcore_gpt = True nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False + nemo_config.bias_dropout_add_fusion = False base = 128 while falcon_config.vocab_size % base != 0: @@ -282,7 +283,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if falcon_config.new_decoder_architecture: add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) else: add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) if not falcon_config.parallel_attn: @@ -306,7 +307,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) - model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) + model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() From 9bafd733698cd8e8ae59b4eecb24e54c0b99977a Mon Sep 17 00:00:00 2001 From: vivian Date: Tue, 19 Sep 2023 20:13:40 +0000 Subject: [PATCH 06/69] new transformerlayer for falcon --- .../language_modeling/megatron/__init__.py | 1 + .../megatron/falcon_mcore/falcon_gpt_model.py | 309 ++++++++++++++++++ .../falcon_mcore/falcon_transformer_block.py | 287 ++++++++++++++++ .../falcon_mcore/falcon_transformer_config.py | 273 ++++++++++++++++ .../falcon_mcore/falcon_transformer_layer.py | 271 +++++++++++++++ 5 files changed, 1141 insertions(+) create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py create mode 100644 
nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 3afb1e3fae48..0a7cab62e240 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -16,6 +16,7 @@ try: from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_gpt_model import FalconGPTModel HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py new file mode 100644 index 000000000000..bcb904e7e4fc --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py @@ -0,0 +1,309 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# just copy paste here, need work +import logging +from typing import Literal, Optional + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.gpt.gpt_embedding import GPTEmbedding +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.module import MegatronModule +#from megatron.core.transformer.transformer_block import TransformerBlock +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import FalconTransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class FalconGPTModel(MegatronModule): + """Transformer language model. + + Arguments: + config (TransformerConfig): transformer config + + vocab_size (int): vocabulary size + + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks + + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are + shared. Defaults to False. + + position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + + seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. + The value must be a float larger than 1.0. Defaults to None. 
+ """ + + def __init__( + self, + config: TransformerConfig, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + ): + super(GPTModel, self).__init__(config=config) + + self.config: TransformerConfig = config + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + # Embeddings. + if self.pre_process: + self.embedding = GPTEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + add_position_embedding=(self.position_embedding_type == 'learned_absolute'), + ) + + # Rotary Position Embeddings + if self.position_embedding_type == 'rope': + rotary_dim = self.config.kv_channels + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) + else: + self.rotary_pos_emb = None + + # Transformer. + self.decoder = FalconTransformerBlock( + config=self.config, + self_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + ) + + if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): + self.initialize_last_stage_with_word_embeddings() + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params=None, + ): + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. 
+ if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.rotary_pos_emb is not None: + if inference_params is not None: + rotary_seq_len = inference_params.max_sequence_length + else: + if self.decoder.input_tensor is not None: + rotary_seq_len = self.decoder.input_tensor.size(0) + else: + rotary_seq_len = decoder_input.size(0) + + # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region + if self.config.sequence_parallel: + rotary_seq_len *= self.config.tensor_model_parallel_size + + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. + hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0, 1).contiguous() + return loss + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def initialize_last_stage_with_word_embeddings(self): + + # This function just initializes the word embeddings in the final stage + # when we are using pipeline parallelism and sharing word + # embeddings. Nothing to do if we aren't sharing weights or aren't using + # pipeline parallelism. + if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): + return + + if self.post_process and not self.pre_process: + assert not parallel_state.is_pipeline_first_stage() + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.output_layer.weight.data.fill_(0) + self.output_layer.weight.shared = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + + # Ensure that first and last stages have the same initial parameter + # values. 
+ if torch.distributed.is_initialized(): + if parallel_state.is_rank_in_embedding_group(): + weight = self.shared_embedding_or_output_weight() + torch.distributed.all_reduce( + weight.data, group=parallel_state.get_embedding_group() + ) + + elif not getattr(GPTModel, "embedding_warning_printed", False): + logging.getLogger(__name__).warning( + "Distributed processes aren't initialized, so the output layer " + "is not initialized with weights from the word embeddings. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong." + ) + GPTModel.embedding_warning_printed = True + + def sharded_state_dict(self, prefix=''): + sharded_state_dict = {} + + if self.pre_process: + embedding_prefix = f'{prefix}embedding.' + embedding_sharded_state_dict = self.embedding.sharded_state_dict( + prefix=embedding_prefix + ) + sharded_state_dict.update(embedding_sharded_state_dict) + + decoder_prefix = f'{prefix}decoder.' + decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) + sharded_state_dict.update(decoder_sharded_state_dict) + + if self.post_process: + output_layer_prefix = f'{prefix}output_layer.' + output_layer_key = f'{output_layer_prefix}weight' + if self.share_embeddings_and_output_weights: + if not self.pre_process: + # when sharing embeddings with last stage, we need to use the weights from the first stage + # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight + tensor = self.shared_embedding_or_output_weight() + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + last_stage_word_emb_replica_id = ( + dp_rank + dp_size + ) # copy of first stage embedding + + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + else: + output_layer_state_dict = self.output_layer.state_dict( + prefix=output_layer_prefix, keep_vars=True + ) + output_layer_tensor = output_layer_state_dict[output_layer_key] + # independent output layer + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=output_layer_tensor, + key=output_layer_key, + replica_id=parallel_state.get_data_parallel_rank(), + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py new file mode 100644 index 000000000000..30a80782b5ea --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py @@ -0,0 +1,287 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
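+"""Falcon variant of megatron-core's TransformerBlock, built from FalconTransformerLayer.
+
+Minimal usage sketch (an assumption for illustration only; it presumes megatron-core
+parallel state is already initialized and `cfg` is a fully populated TransformerConfig,
+and it is not exercised anywhere in this module):
+
+    block = FalconTransformerBlock(config=cfg, self_attn_mask_type=AttnMaskType.causal)
+    hidden_states = block(hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb)
+"""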
+ +import re +from contextlib import nullcontext + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +# change import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import FalconTransformerLayer +from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor + + +class FalconTransformerBlock(MegatronModule): + """Transformer class.""" + + def __init__( + self, + config: TransformerConfig, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + self.self_attn_mask_type = self_attn_mask_type + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + + # required for pipeline parallel schedules + self.input_tensor = None + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + self.num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + self._build_layers() + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_number): + layer = FalconTransformerLayer( + config=self.config, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) + return layer + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # Interleaved pipeline parallelism: + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. + # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size + + num_layers_to_build = num_layers_per_virtual_rank + + else: + # Non-interleaved pipeline parallelism: + # Each stage gets a contiguous set of layers. + + num_layers_to_build = self.num_layers_per_pipeline_rank + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) + + # # TODO: add back standalone_embedding_stage + # if self.num_layers == 0: + # # When a standalone embedding stage is used (e.g., + # # args.standalone_embedding_stage == True), virtual pipeline ranks + # # on pipeline rank 0 will have zero transformer layers assigned to + # # them. 
This results in the model's input and output tensors to be + # # the same, which will cause failure for certain output tensor + # # optimizations (e.g., pipeline output deallocation). To remedy + # # this, we assign a 'no-op' layer on these ranks, which will + # # disconnect the input tensor from the output tensor. + # self.num_layers = 1 + # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + # else: + # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): + """Forward method with activation checkpointing.""" + + def custom(start, end): + def custom_forward(*args, **kwargs): + x_, *args = args + for index in range(start, end): + layer = self._get_layer(index) + x_ = layer(x_, *args, **kwargs) + return x_ + + return custom_forward + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers_per_pipeline_rank: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + self.config.recompute_num_layers), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + + l += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + for l in range(self.num_layers_per_pipeline_rank): + if l < self.config.recompute_num_layers: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + 1), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + else: + hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): + # hidden_states (float): [s, b, h] + # attention_mask (bool): [1, 1, s, s] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. 
+ # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True, + ) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + if self.config.fp8: + import transformer_engine # To keep out TE dependency when not training in fp8 + + if self.config.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif self.config.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") + + fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=self.config.fp8_margin, + interval=self.config.fp8_interval, + fp8_format=fp8_format, + amax_compute_algo=self.config.fp8_amax_compute_algo, + amax_history_len=self.config.fp8_amax_history_len, + override_linear_precision=(False, False, not self.config.fp8_wgrad), + ) + fp8_group = None + if parallel_state.model_parallel_is_initialized(): + fp8_group = parallel_state.get_amax_reduction_group() + fp8_context = transformer_engine.pytorch.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group + ) + else: + fp8_context = nullcontext() + + with rng_context and fp8_context: + # Forward pass. + if self.config.recompute_granularity == 'full': + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + ) + else: + for layer in self.layers: + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=inference_params, + ) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + def sharded_state_dict(self, prefix=''): + + sharded_state_dict = {} + + layer_prefix = f'{prefix}layers.' + for layer in self.layers: + sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) + + if self.post_process and self.post_layer_norm: + state_dict = self.state_dict(keep_vars=True) + + tensor = state_dict['final_layernorm.weight'] + layer_name = f'{prefix}final_layernorm.weight' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) + + # RMSNorm doesn't have bias. 
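+            # (Assumption noted for clarity: Falcon checkpoints use standard LayerNorm,
+            #  which does carry a bias, so the branch below is expected to fire for Falcon;
+            #  with normalization='RMSNorm' the key is simply absent.)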
+ if 'final_layernorm.bias' in state_dict.keys(): + tensor = state_dict['final_layernorm.bias'] + layer_name = f'{prefix}final_layernorm.bias' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( + tensor, layer_name + ) + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py new file mode 100644 index 000000000000..cb980dad1b5f --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py @@ -0,0 +1,273 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# just copy paste here, need work +from dataclasses import dataclass +from typing import Callable + +import torch +import torch.nn.functional as F + +from megatron.core import ModelParallelConfig +from megatron.core.utils import init_method_normal, scaled_init_method_normal + + +@dataclass +class TransformerConfig(ModelParallelConfig): + """Configuration object for megatron-core transformers. + + Attributes: + + # model architecture + num_layers (int): Number of transformer layers in a transformer block. + hidden_size (int): Transformer hidden size. + ffn_hidden_size (int): Transformer Feed-Forward Network hidden size. + This is set to 4*hidden_size if not provided. Defaults to None.') + num_attention_heads (int): Number of transformer attention heads. + kv_channels (int): Projection weights dimension in multi-head attention. + This is set to hidden_size // num_attention_heads if not provided. + Defaults to None. + num_query_groups (int): Number of query groups for group query attention. If None, normal attention is used. + + hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1. + attention_dropout (float): Post attention dropout probability. Defaults to 0.1. + fp32_residual_connection (bool): If true, move residual connections to fp32. + apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residule connection ordering. + Defaults to False. + layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5. + + layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values + around 0. This improves numerical stability. Defaults to False. + + add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two + in MLP layer). Default is True. + + gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False. + + activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu. + + # initialization + init_method (Callable): Method to initialize weights. Note that bias is always set to + zero. Should be a function that takes a single Tensor and + initializes it. Defaults to + megatron.core.utils.init_method_normal(init_method_std) which is + torch.nn.init.normal_ with mean=0.0 and std=init_method_Std. + + output_layer_init_method (Callable): Method to initialize weights of the output layer of + both attention and MLP blocks. Defaults to + megatron.core.utils.scaled_init_method_normal(init_method_std) + which is torch.nn.init.normal_ with mean=0.0 and + std=init_method_std / math.sqrt(2.0 * num_layers). 
+
+    init_method_std (float): Standard deviation of the zero mean normal for the default
+                             initialization method, not used if init_method and
+                             output_layer_init_method are provided. Defaults to 0.02.
+
+    # mixed-precision
+    apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True.
+    attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32.
+                                      This should be true if apply_query_key_layer_scaling is true.
+
+    # fusion
+    bias_gelu_fusion (bool): If true, fuses bias and gelu. Defaults to False.
+    masked_softmax_fusion (bool): If true, uses softmax fusion.
+    persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel.
+                               This kernel only supports a fixed set of hidden sizes.
+                               Defaults to False.
+    bias_dropout_fusion (bool): If true, uses bias dropout fusion.
+
+    # activation recomputation
+
+    recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory
+                                 intensive part of attention is checkpointed. These memory intensive activations
+                                 are also less compute intensive which makes activation checkpointing more efficient
+                                 for LLMs (20B+). See Reducing Activation Recomputation in Large Transformer
+                                 Models: https://arxiv.org/abs/2205.05198 for more details. 'full' will checkpoint
+                                 the entire transformer layer. Must be 'selective' or 'full'. 'selective' always uses all layers.
+                                 Defaults to None.
+
+    recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer
+                            block and recompute the input activation of each divided chunk at the specified
+                            granularity. block will recompute the input activations for only a set number of
+                            transformer layers per pipeline stage. The rest of the layers in the pipeline stage
+                            will not have any activations recomputed. Must be 'uniform' or 'block'. Defaults to
+                            None.
+
+    recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer
+                                layers in each uniformly divided recompute unit. When recompute_method is block,
+                                recompute_num_layers is the number of transformer layers to recompute within each
+                                pipeline stage. Must be None for 'selective' activation checkpointing. Defaults to None.
+
+    distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel
+                                         group. Defaults to None.
+
+    # fp8 related (via Transformer Engine). For detailed info, refer to the Transformer Engine docs at
+    # https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html
+
+    fp8 (str): If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined choices: (1) 'e4m3'
+               uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 activation and weight tensors and
+               e5m2 for all FP8 output activation gradient tensors. Defaults to None.
+
+    fp8_margin (int): Margin for the scaling factor computation.
+
+    fp8_interval (int): Controls how often the scaling factor is recomputed.
+
+    fp8_amax_history_len (int): The length of the amax history window used for scaling factor computation.
+
+    fp8_amax_compute_algo (str): Algorithm used for choosing the `amax` value for the scaling factor computation.
+                                 There are 2 predefined choices: `max` chooses the largest `amax` in the history
+                                 window, while `most_recent` always chooses the most recently seen value.
+ + fp8_wgrad (bool): When set to False, override FP8 config options and do the wgrad computation in higher precision. + Defaults to True. + + # Experimental + normalization (str): Swtich b/w `LayerNorm` and `RMSNorm` as normalization layers. For now, these are primarily + used by Transformer-Engine's layers like `LayerNormLinear`. Default value is `LayerNorm`. + + + """ + + # model architecture + num_layers: int = 0 + hidden_size: int = 0 + num_attention_heads: int = 0 + num_query_groups: int = None + + ffn_hidden_size: int = None + kv_channels: int = None + hidden_dropout: float = 0.1 + attention_dropout: float = 0.1 + fp32_residual_connection: bool = False + # @jcasper should we keep this option? + apply_residual_connection_post_layernorm: bool = False + layernorm_epsilon: float = 1e-5 + layernorm_zero_centered_gamma: bool = False + add_bias_linear: bool = True + gated_linear_unit: bool = False + activation_func: Callable = F.gelu + + # initialization + init_method: Callable = None + output_layer_init_method: Callable = None + init_method_std: float = 0.02 + + # mixed-precision + apply_query_key_layer_scaling: bool = True + attention_softmax_in_fp32: bool = True + + # communication + + # fusion + bias_gelu_fusion: bool = False # TODO: this should be bias_activation_fusion ? + masked_softmax_fusion: bool = False + persist_layer_norm: bool = False + bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion? + + # activation recomputation + recompute_granularity: str = None + recompute_method: str = None + recompute_num_layers: int = None + distribute_saved_activations: bool = None + + # fp8 related + fp8: str = None + fp8_margin: int = 0 + fp8_interval: int = 1 + fp8_amax_history_len: int = 1 + fp8_amax_compute_algo: str = "most_recent" + fp8_wgrad: bool = True + + # experimental section (TODO: move to apt. section above once stable) + normalization: bool = "LayerNorm" # alt value supported by TE: "RMSNorm" + + def __post_init__(self): + """ Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + super().__post_init__() + if self.fp16 and self.bf16: + raise ValueError( + f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' + ) + + if self.num_attention_heads % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.ffn_hidden_size is None: + self.ffn_hidden_size = 4 * self.hidden_size + + if self.kv_channels is None: + self.kv_channels = self.hidden_size // self.num_attention_heads + + if self.num_query_groups is None: + self.num_query_groups = self.num_attention_heads + + if self.num_query_groups % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_query_groups ({self.num_query_groups}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.recompute_granularity is not None: + if not self.recompute_granularity in ['full', 'selective']: + raise ValueError( + f'When using recompute_granuarlity: {self.recompute_granularity} must be "full" or "selective".' 
+ ) + + if self.recompute_method is not None: + if not self.recompute_method in ['block', 'uniform']: + raise ValueError( + f'recompute_method: {self.recompute_method} must be "block" or "uniform".' + ) + elif self.recompute_granularity != 'selective': + raise ValueError( + f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' + ) + + if self.recompute_granularity != 'selective' and self.recompute_num_layers is None: + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' + f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' + ) + elif ( + self.recompute_granularity == 'selective' and self.recompute_num_layers is not None + ): + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' + ) + + if self.distribute_saved_activations and self.sequence_parallel: + raise ValueError( + f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}' + ) + + if self.virtual_pipeline_model_parallel_size is not None: + if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0: + raise ValueError( + f'num_layers: {self.num_layers} must be divisible by virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}' + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.bias_gelu_fusion: + if not self.add_bias_linear: + raise ValueError( + "When bias_gelu_fusion is True, add_bias_linear must also be True." + ) + + if self.activation_func != F.gelu: + raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.') + + if self.init_method is None: + self.init_method = init_method_normal(self.init_method_std) + + if self.output_layer_init_method is None: + self.output_layer_init_method = scaled_init_method_normal( + self.init_method_std, self.num_layers + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py new file mode 100644 index 000000000000..9bd40a84376f --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py @@ -0,0 +1,271 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import re + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +#from megatron.core.transformer.attention import SelfAttention +# change attention due to extra layernorm before mlp, ln_mlp. 
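+# (Assumption, based on the comment above and on how the conversion script maps HF
+#  Falcon's ln_attn/ln_mlp weights: the Falcon-specific SelfAttention imported below
+#  exists to accommodate the extra pre-MLP layernorm used by the new decoder
+#  architecture.)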
+from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +class FalconTransformerLayer(MegatronModule): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Args: + new_decoder_architecture (bool): + Whether to use Falcon's new decoder architecture that were used in 7B/40B/180B variants. + + parallel_attention (bool): + Whether to use parallel attention, which computes attention in parallel with feed forward layer. + + """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int = 1, + self_attn_mask_type=AttnMaskType.padding, + parallel_attention=False, + new_decoder_architecture=False, + ): + super().__init__(config=config) + self.config: TransformerConfig = config + + self.layer_number = layer_number + self._get_layer_offset() + + self.self_attn_mask_type = self_attn_mask_type + + self.new_decoder_architecture = new_decoder_architecture + self.parallel_attention = parallel_attention + + # Layernorm on the input data. + # TODO: add pytorch only layernorm + self.input_layernorm = self._create_identity_op() + + self.mlp_layernorm = self._create_identity_op() if self.new_decoder_architecture else None + + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + # Layernorm on the attention output + self.post_self_attn_layernorm = self._create_identity_op() + + # Self attention. + self.self_attention = SelfAttention( + config=self.config, layer_number=layer_number, attn_mask_type=self_attn_mask_type, + ) + + # MLP + self.mlp = MLP(config=self.config) + + # @jcasper how should we handle nvfuser? + # Set bias+dropout+add fusion grad_enable execution handler. 
+ # TORCH_MAJOR = int(torch.__version__.split('.')[0]) + # TORCH_MINOR = int(torch.__version__.split('.')[1]) + # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = torch.enable_grad + + def _create_identity_op(self): + """Helper function to create an IdentityOp with common parameters.""" + return IdentityOp( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + def _get_layer_offset(self): + + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + total_num_layers = self.config.num_layers + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + total_virtual_chunks = total_num_layers // vp_size + offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) + + else: + # Each stage gets a contiguous set of layers. + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + offset = pipeline_rank * num_layers_per_pipeline_rank + else: + offset = 0 + + return offset + + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + input_mlp_ln = layernorm_output + + # Self attention. + attention_output_with_bias = self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # Residual connection. + if self.config.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # falcon specific + if self.new_decoder_architecture: + mlp_ln_output = self.mlp_layernorm(hidden_states) + + bias_dropout_add_func = get_bias_dropout_add(self.training, self.config.bias_dropout_fusion) + + # bias_dropout_add fusion returning fp32 instead of bf16 + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output_with_bias, residual, self.config.hidden_dropout + ) + + # falcon specific + if not self.new_decoder_architecture: + if self.parallel_attention: + layernorm_output = input_mlp_ln + else: + layernorm_output = self.post_self_attn_layernorm(layernorm_input) + residual = layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output + else: + layernorm_output = mlp_ln_output + + # MLP. 
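+        # Dataflow summary of the branches above (descriptive only, no extra logic):
+        #   sequential (no parallel attn): x -> input_layernorm -> attn -> add residual x
+        #                                    -> post_self_attn_layernorm -> mlp -> add residual
+        #   parallel_attention:            mlp reuses input_layernorm(x); its output is summed
+        #                                  with the attention output below, then added to x
+        #   new_decoder_architecture:      mlp uses its own mlp_layernorm(x); its output is
+        #                                  summed with the attention output below, then added to x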
+ mlp_output_with_bias = self.mlp(layernorm_output) + + # falcon specific: + if self.new_decoder_architecture or self.parallel_attention: + mlp_output_with_bias = mlp_output_with_bias + attention_output_with_bias + + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func( + mlp_output_with_bias, residual, self.config.hidden_dropout + ) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True + ) + + return output + + def sharded_state_dict(self, prefix=''): + + # state_dict = self.state_dict(prefix=prefix, keep_vars=True) + state_dict = self.state_dict(keep_vars=True) + + tensor_parallel_layers_axis_map = { + 'self_attention.linear_qkv.weight': 0, + 'self_attention.linear_qkv.bias': 0, + 'self_attention.linear_proj.weight': 1, + 'mlp.linear_fc1.weight': 0, + 'mlp.linear_fc1.bias': 0, + 'mlp.linear_fc2.weight': 1, + } + + offset = self._get_layer_offset() + num_layers = self.config.num_layers + + sharded_state_dict = {} + + for layer_name in state_dict.keys(): + tensor = state_dict[layer_name] + global_layer_offset = self.layer_number - 1 # self.layer_number starts at 1 + layer_key = f'{prefix}{global_layer_offset - offset}.{layer_name}' # module list index in TransformerBlock + sharded_offsets = [(0, global_layer_offset, num_layers)] # PP sharding + + if layer_name in tensor_parallel_layers_axis_map: + tp_axis = tensor_parallel_layers_axis_map[layer_name] + # TP sharding + sharded_offsets.append( + [ + tp_axis + 1, # +1 for PP dimension + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ] + ) + replica_id = parallel_state.get_data_parallel_rank() + else: + replica_id = ( + parallel_state.get_data_parallel_rank() + * parallel_state.get_data_parallel_world_size() + + parallel_state.get_tensor_model_parallel_rank() + ) + + if layer_name.endswith('._extra_state'): + sharded_state_dict[layer_key] = ShardedObject( + f'{prefix}{layer_name}', + tensor, + (num_layers,), + (global_layer_offset,), + replica_id, + ) + + else: + sharded_state_dict[layer_key] = ShardedTensor.from_rank_offsets( + f'{prefix}{layer_name}', + tensor, + *sharded_offsets, + replica_id=replica_id, + prepend_axis_num=1, # for PP sharding + ) + + return sharded_state_dict From 36fe312249bb2938df6013b2743b8997c1bc1ab8 Mon Sep 17 00:00:00 2001 From: vivian Date: Tue, 19 Sep 2023 21:06:10 +0000 Subject: [PATCH 07/69] fix for new decoder architecture --- .../convert_hf_falcon_to_nemo.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index b4f98d639a5e..aff5901b3f88 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -196,12 +196,8 @@ def load_nemo_config(args): nemo_config.mcore_gpt = True nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False -<<<<<<< HEAD nemo_config.bias_dropout_add_fusion = False -======= - ->>>>>>> 8297b5ccb9b4c489e25316777cf8181f24712ebe base = 128 while falcon_config.vocab_size % base != 0: base //= 2 
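+    # The loop above finds the largest power of two (capped at 128) that divides the HF
+    # vocab size, presumably so NeMo's make_vocab_size_divisible_by setting does not pad
+    # the embedding table. Worked example (assuming tiiuae/falcon-7b's vocab_size of
+    # 65024): 65024 = 128 * 508, so base remains 128.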
@@ -312,19 +308,8 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if falcon_config.new_decoder_architecture: -<<<<<<< HEAD add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) -======= - add_weight_and_possible_bias( - f'{prefix}.ln_attn', - f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', - is_layernorm=True, - ) - add_weight_and_possible_bias( - f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True - ) ->>>>>>> 8297b5ccb9b4c489e25316777cf8181f24712ebe else: add_weight_and_possible_bias( f'{prefix}.input_layernorm', @@ -355,13 +340,8 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model -<<<<<<< HEAD #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) -======= - # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) - model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) ->>>>>>> 8297b5ccb9b4c489e25316777cf8181f24712ebe model._save_restore_connector = NLPSaveRestoreConnector() From 044026d809c7fe23931df63b3a08f6de292e2c61 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 21:07:50 +0000 Subject: [PATCH 08/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/__init__.py | 2 +- .../megatron/falcon_mcore/falcon_gpt_model.py | 29 ++++----- .../falcon_mcore/falcon_transformer_block.py | 24 +++----- .../falcon_mcore/falcon_transformer_config.py | 20 ++---- .../falcon_mcore/falcon_transformer_layer.py | 61 ++++++++----------- .../convert_hf_falcon_to_nemo.py | 14 +++-- 6 files changed, 61 insertions(+), 89 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 0a7cab62e240..5dec9388528e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,8 +15,8 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py index bcb904e7e4fc..b76b3d8828ee 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py @@ -4,17 +4,19 @@ from typing import Literal, Optional 
import torch -from torch import Tensor - from megatron.core import parallel_state, tensor_parallel from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding from megatron.core.models.gpt.gpt_embedding import GPTEmbedding from megatron.core.transformer.enums import AttnMaskType, ModelType from megatron.core.transformer.module import MegatronModule -#from megatron.core.transformer.transformer_block import TransformerBlock -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import FalconTransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint +from torch import Tensor + +# from megatron.core.transformer.transformer_block import TransformerBlock +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import ( + FalconTransformerBlock, +) class FalconGPTModel(MegatronModule): @@ -112,8 +114,7 @@ def __init__( bias=False, skip_bias_add=False, gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process - and self.share_embeddings_and_output_weights, + skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, ) if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): @@ -239,9 +240,7 @@ def initialize_last_stage_with_word_embeddings(self): if torch.distributed.is_initialized(): if parallel_state.is_rank_in_embedding_group(): weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce( - weight.data, group=parallel_state.get_embedding_group() - ) + torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) elif not getattr(GPTModel, "embedding_warning_printed", False): logging.getLogger(__name__).warning( @@ -258,9 +257,7 @@ def sharded_state_dict(self, prefix=''): if self.pre_process: embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict( - prefix=embedding_prefix - ) + embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) sharded_state_dict.update(embedding_sharded_state_dict) decoder_prefix = f'{prefix}decoder.' 
@@ -278,9 +275,7 @@ def sharded_state_dict(self, prefix=''): first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' dp_rank = parallel_state.get_data_parallel_rank() dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = ( - dp_rank + dp_size - ) # copy of first stage embedding + last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( tensor=tensor, @@ -292,9 +287,7 @@ def sharded_state_dict(self, prefix=''): sharded_state_dict[output_layer_key] = sharded_output_layer_tensor else: - output_layer_state_dict = self.output_layer.state_dict( - prefix=output_layer_prefix, keep_vars=True - ) + output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) output_layer_tensor = output_layer_state_dict[output_layer_key] # independent output layer sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py index 30a80782b5ea..d3b9ae63e4ac 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py @@ -4,17 +4,19 @@ from contextlib import nullcontext import torch - from megatron.core import parallel_state, tensor_parallel from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.transformer.custom_layers.transformer_engine import TENorm from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig -# change import FalconTransformerLayer -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import FalconTransformerLayer from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor +# change import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import ( + FalconTransformerLayer, +) + class FalconTransformerBlock(MegatronModule): """Transformer class.""" @@ -56,9 +58,7 @@ def _build_layers(self): # self.norm_factor *= coeff def build_layer(layer_number): layer = FalconTransformerLayer( - config=self.config, - layer_number=layer_number, - self_attn_mask_type=self.self_attn_mask_type, + config=self.config, layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type, ) return layer @@ -202,9 +202,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # likely redundant, since p2p_communication.py (likely originator) # already creates viewless tensors. That said, make_viewless_tensor() # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor( - inp=hidden_states, requires_grad=True, keep_graph=True, - ) + hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) if self.config.sequence_parallel: rng_context = tensor_parallel.get_cuda_rng_tracker().fork() @@ -242,9 +240,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # Forward pass. 
if self.config.recompute_granularity == 'full': hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, + hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, ) else: for layer in self.layers: @@ -280,8 +276,6 @@ def sharded_state_dict(self, prefix=''): if 'final_layernorm.bias' in state_dict.keys(): tensor = state_dict['final_layernorm.bias'] layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( - tensor, layer_name - ) + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py index cb980dad1b5f..e804a6228e70 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py @@ -185,9 +185,7 @@ def __post_init__(self): """ super().__post_init__() if self.fp16 and self.bf16: - raise ValueError( - f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' - ) + raise ValueError(f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.') if self.num_attention_heads % self.tensor_model_parallel_size != 0: raise ValueError( @@ -221,9 +219,7 @@ def __post_init__(self): if self.recompute_method is not None: if not self.recompute_method in ['block', 'uniform']: - raise ValueError( - f'recompute_method: {self.recompute_method} must be "block" or "uniform".' - ) + raise ValueError(f'recompute_method: {self.recompute_method} must be "block" or "uniform".') elif self.recompute_granularity != 'selective': raise ValueError( f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' @@ -234,9 +230,7 @@ def __post_init__(self): f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' ) - elif ( - self.recompute_granularity == 'selective' and self.recompute_num_layers is not None - ): + elif self.recompute_granularity == 'selective' and self.recompute_num_layers is not None: raise ValueError( f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' ) @@ -257,9 +251,7 @@ def __post_init__(self): if self.bias_gelu_fusion: if not self.add_bias_linear: - raise ValueError( - "When bias_gelu_fusion is True, add_bias_linear must also be True." 
- ) + raise ValueError("When bias_gelu_fusion is True, add_bias_linear must also be True.") if self.activation_func != F.gelu: raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.') @@ -268,6 +260,4 @@ def __post_init__(self): self.init_method = init_method_normal(self.init_method_std) if self.output_layer_init_method is None: - self.output_layer_init_method = scaled_init_method_normal( - self.init_method_std, self.num_layers - ) + self.output_layer_init_method = scaled_init_method_normal(self.init_method_std, self.num_layers) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py index 9bd40a84376f..e592a1e0b1ac 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py @@ -3,13 +3,9 @@ import re import torch - from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -#from megatron.core.transformer.attention import SelfAttention -# change attention due to extra layernorm before mlp, ln_mlp. -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention from megatron.core.transformer.custom_layers.transformer_engine import TENorm from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp @@ -18,6 +14,10 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor +# from megatron.core.transformer.attention import SelfAttention +# change attention due to extra layernorm before mlp, ln_mlp. +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention + """ We use the following notation throughout this file: h: hidden size n: number of attention heads @@ -33,6 +33,7 @@ hyperparameters: transformer hyperparameters """ + class FalconTransformerLayer(MegatronModule): """A single transformer layer. @@ -62,16 +63,16 @@ def __init__( self.layer_number = layer_number + self._get_layer_offset() self.self_attn_mask_type = self_attn_mask_type - + self.new_decoder_architecture = new_decoder_architecture self.parallel_attention = parallel_attention - + # Layernorm on the input data. # TODO: add pytorch only layernorm self.input_layernorm = self._create_identity_op() - + self.mlp_layernorm = self._create_identity_op() if self.new_decoder_architecture else None - + if self.new_decoder_architecture or self.parallel_attention: self.post_self_attn_layernorm = None else: @@ -93,7 +94,7 @@ def __init__( # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad self.bias_dropout_add_exec_handler = torch.enable_grad - + def _create_identity_op(self): """Helper function to create an IdentityOp with common parameters.""" return IdentityOp( @@ -142,26 +143,23 @@ def forward( rotary_pos_emb=None, ): # hidden_states: [s, b, h] - + # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) input_mlp_ln = layernorm_output - + # Self attention. 
attention_output_with_bias = self.self_attention( - layernorm_output, - attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb, + layernorm_output, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, ) - + # Residual connection. if self.config.apply_residual_connection_post_layernorm: residual = layernorm_output else: residual = hidden_states - - # falcon specific + + # falcon specific if self.new_decoder_architecture: mlp_ln_output = self.mlp_layernorm(hidden_states) @@ -169,9 +167,7 @@ def forward( # bias_dropout_add fusion returning fp32 instead of bf16 with self.bias_dropout_add_exec_handler(): - layernorm_input = bias_dropout_add_func( - attention_output_with_bias, residual, self.config.hidden_dropout - ) + layernorm_input = bias_dropout_add_func(attention_output_with_bias, residual, self.config.hidden_dropout) # falcon specific if not self.new_decoder_architecture: @@ -179,7 +175,9 @@ def forward( layernorm_output = input_mlp_ln else: layernorm_output = self.post_self_attn_layernorm(layernorm_input) - residual = layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output + residual = ( + layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output + ) else: layernorm_output = mlp_ln_output @@ -189,11 +187,9 @@ def forward( # falcon specific: if self.new_decoder_architecture or self.parallel_attention: mlp_output_with_bias = mlp_output_with_bias + attention_output_with_bias - + with self.bias_dropout_add_exec_handler(): - output = bias_dropout_add_func( - mlp_output_with_bias, residual, self.config.hidden_dropout - ) + output = bias_dropout_add_func(mlp_output_with_bias, residual, self.config.hidden_dropout) # Jit compiled function creates 'view' tensor. This tensor # potentially gets saved in the MPU checkpoint function context, @@ -201,9 +197,7 @@ def forward( # won't result in memory savings (like the data loader, or # p2p_communication), it serves to document the origin of this # 'view' tensor. 
- output = make_viewless_tensor( - inp=output, requires_grad=output.requires_grad, keep_graph=True - ) + output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True) return output @@ -245,18 +239,13 @@ def sharded_state_dict(self, prefix=''): replica_id = parallel_state.get_data_parallel_rank() else: replica_id = ( - parallel_state.get_data_parallel_rank() - * parallel_state.get_data_parallel_world_size() + parallel_state.get_data_parallel_rank() * parallel_state.get_data_parallel_world_size() + parallel_state.get_tensor_model_parallel_rank() ) if layer_name.endswith('._extra_state'): sharded_state_dict[layer_key] = ShardedObject( - f'{prefix}{layer_name}', - tensor, - (num_layers,), - (global_layer_offset,), - replica_id, + f'{prefix}{layer_name}', tensor, (num_layers,), (global_layer_offset,), replica_id, ) else: diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index aff5901b3f88..d3de7e128782 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -197,7 +197,7 @@ def load_nemo_config(args): nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False nemo_config.bias_dropout_add_fusion = False - + base = 128 while falcon_config.vocab_size % base != 0: base //= 2 @@ -308,8 +308,14 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if falcon_config.new_decoder_architecture: - add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.ln_attn', + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', + is_layernorm=True, + ) + add_weight_and_possible_bias( + f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True + ) else: add_weight_and_possible_bias( f'{prefix}.input_layernorm', @@ -340,7 +346,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model - #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() From 908004e0d81e47ceabe2e7319bbf299167e78407 Mon Sep 17 00:00:00 2001 From: Vivian Date: Thu, 21 Sep 2023 21:59:24 +0000 Subject: [PATCH 09/69] add DDP --- .../convert_hf_falcon_to_nemo.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index d3de7e128782..6dfada1c3580 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -46,6 +46,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -60,6 +61,7 @@ # [Y] trust remote code add # [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not 
sure about MQA) # [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error +# [Y] update save_to and restore_from for dist checkpointing # [ ] remove unnecessary comments and codes. @@ -178,8 +180,8 @@ def load_nemo_config(args): nemo_config.tokenizer = tokenizer_dict ############################################## # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch - # nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn - # nemo_config.parallel_attention = falcon_config['parallel_attn'] + nemo_config.new_decoder_architecture = falcon_config.new_decoder_architecture #bool, if True, always use parallel attn + nemo_config.parallel_attention = falcon_config.parallel_attn ############################################### nemo_config.num_query_groups = ( @@ -254,8 +256,8 @@ def convert(args): dtype = determine_dtype(precision) nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision) - + trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + hidden_size = falcon_config.hidden_size head_num = falcon_config.num_attention_heads head_size = hidden_size // head_num @@ -346,7 +348,6 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model - # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() @@ -362,6 +363,6 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if __name__ == '__main__': - setup_logging() + #setup_logging() args = get_args() convert(args) From c69d577c46ec6004435234398bc2f97ca3667db1 Mon Sep 17 00:00:00 2001 From: Vivian Date: Sat, 23 Sep 2023 02:25:16 +0000 Subject: [PATCH 10/69] fix state dict based on spec system --- .../convert_hf_falcon_to_nemo.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 6dfada1c3580..c7c828e99f37 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -65,7 +65,7 @@ # [ ] remove unnecessary comments and codes. 
-def setup_logging(log_file="test_log.txt"): +def setup_logging(log_file="test.txt"): logging.basicConfig( filename=log_file, level=logging.DEBUG, @@ -93,11 +93,13 @@ def get_args(): def load_model(cls, checkpoint, strict, **kwargs): try: + logging.debug(f'kwargs are, {kwargs}') if 'cfg' in kwargs: model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) else: model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) for name, module in model.named_parameters(): + logging.debug(f'model state dict name, {name}') if name in checkpoint['state_dict']: module.data = checkpoint['state_dict'][name] checkpoint['state_dict'].pop(name) @@ -199,6 +201,7 @@ def load_nemo_config(args): nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False nemo_config.bias_dropout_add_fusion = False + nemo_config.share_embeddings_and_output_weights = False base = 128 while falcon_config.vocab_size % base != 0: @@ -240,6 +243,8 @@ def convert(args): logging.debug(f"initial falcon_config, {falcon_config}") nemo_config = load_nemo_config(args) + # debug + logging.debug(f"initial nemo_config, {nemo_config}") precision = determine_precision(args) plugins = [] @@ -280,19 +285,16 @@ def convert(args): checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() - def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm=False): + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): source_name = f"{source_prefix}.{weight_or_bias}" if source_name in model.state_dict(): - if is_layernorm: - target_name = f"{target_prefix}_{weight_or_bias}" - else: - target_name = f"{target_prefix}.{weight_or_bias}" + target_name = f"{target_prefix}.{weight_or_bias}" checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) - def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=False): - add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) + def add_weight_and_possible_bias(source_prefix, target_prefix): + add_to_checkpoint(source_prefix, target_prefix, 'weight') if f"{source_prefix}.bias" in model.state_dict(): - add_to_checkpoint(source_prefix, target_prefix, 'bias', is_layernorm) + add_to_checkpoint(source_prefix, target_prefix, 'bias') add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') @@ -312,23 +314,21 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if falcon_config.new_decoder_architecture: add_weight_and_possible_bias( f'{prefix}.ln_attn', - f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', - is_layernorm=True, + f'model.decoder.layers.{l}.input_layernorm', ) add_weight_and_possible_bias( - f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True + f'{prefix}.ln_mlp', + f'model.decoder.layers.{l}.pre_mlp_layernorm', ) else: add_weight_and_possible_bias( f'{prefix}.input_layernorm', - f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', - is_layernorm=True, + f'model.decoder.layers.{l}.input_layernorm', ) if not falcon_config.parallel_attn: add_weight_and_possible_bias( f'{prefix}.post_attention_layernorm', - f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', - is_layernorm=True, + f'model.decoder.layers.{l}.post_self_attn_layernorm', ) print(f"done layer {l}") @@ -348,6 +348,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model + #model = load_model(MegatronGPTModel, checkpoint, 
strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() @@ -358,11 +359,12 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals # We make sure that the tokenizer can be instantiated later regardless of args.input model.cfg.tokenizer.update(type=args.tokenizer_type) # save model + model.save_to(args.out_file) logging.info(f'NeMo model saved to: {args.out_file}') if __name__ == '__main__': - #setup_logging() + setup_logging() args = get_args() convert(args) From 0610e19fae1672b950f3501baf5d44eebba4ecc6 Mon Sep 17 00:00:00 2001 From: Vivian Date: Sun, 24 Sep 2023 21:00:33 +0000 Subject: [PATCH 11/69] fix state dict based on change in layers, fix amp O2 --- .../convert_hf_falcon_to_nemo.py | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index c7c828e99f37..f7995d471e24 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -65,7 +65,7 @@ # [ ] remove unnecessary comments and codes. -def setup_logging(log_file="test.txt"): +def setup_logging(log_file="test.log"): logging.basicConfig( filename=log_file, level=logging.DEBUG, @@ -93,22 +93,35 @@ def get_args(): def load_model(cls, checkpoint, strict, **kwargs): try: - logging.debug(f'kwargs are, {kwargs}') if 'cfg' in kwargs: model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) else: model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) for name, module in model.named_parameters(): - logging.debug(f'model state dict name, {name}') if name in checkpoint['state_dict']: module.data = checkpoint['state_dict'][name] checkpoint['state_dict'].pop(name) else: - logging.info(f"Unexpected key: {name} not in checkpoint but in model.") + print(f"Unexpected key: {name} not in checkpoint but in model.") + + for name, buffer in model.named_buffers(): + if name in checkpoint['state_dict']: + buffer.data = checkpoint['state_dict'][name] + checkpoint['state_dict'].pop(name) + if len(checkpoint['state_dict'].keys()) != 0: raise RuntimeError( f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." 
) + + # register the artifacts + cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] + if cfg.tokenizer.model is not None: + model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) + if cfg.tokenizer.vocab_file is not None: + model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) + if cfg.tokenizer.merge_file is not None: + model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) finally: cls._set_model_restore_state(is_being_restored=False) return model @@ -180,11 +193,9 @@ def load_nemo_config(args): } nemo_config.tokenizer = tokenizer_dict - ############################################## - # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch + nemo_config.new_decoder_architecture = falcon_config.new_decoder_architecture #bool, if True, always use parallel attn nemo_config.parallel_attention = falcon_config.parallel_attn - ############################################### nemo_config.num_query_groups = ( falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None @@ -340,13 +351,15 @@ def add_weight_and_possible_bias(source_prefix, target_prefix): add_to_checkpoint('lm_head', 'model.output_layer', 'weight') checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config - logging.debug(f'final checkpoint, {checkpoint}') - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logging.info(f'Weights loaded. Total time: {t}') + #logging.debug(f'final checkpoint, {checkpoint}') del model + + # state dict name for megatron_amp_O2 is different + if nemo_config.get('megatron_amp_O2', False): + keys = list(checkpoint['state_dict'].keys()) + for key in keys: + checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key) #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) @@ -362,6 +375,10 @@ def add_weight_and_possible_bias(source_prefix, target_prefix): model.save_to(args.out_file) logging.info(f'NeMo model saved to: {args.out_file}') + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logging.info(f'Weights loaded and saved. 
Total time: {t}') if __name__ == '__main__': From a8684d04a7664fc406c9685f3f6acc8309d6825b Mon Sep 17 00:00:00 2001 From: Vivian Date: Sun, 24 Sep 2023 22:29:41 +0000 Subject: [PATCH 12/69] add falcon spec system support --- .../language_modeling/megatron/__init__.py | 2 +- .../megatron/spec_falcon/__init__.py | 1 + .../megatron/spec_falcon/falcon_gpt_model.py | 313 ++++++++++++++++++ .../megatron/spec_falcon/falcon_spec.py | 43 +++ .../spec_falcon/spec_falcon_decoder_block.py | 290 ++++++++++++++++ .../spec_falcon/spec_falcon_decoder_layer.py | 299 +++++++++++++++++ .../language_modeling/megatron_gpt_model.py | 68 +++- 7 files changed, 1010 insertions(+), 6 deletions(-) create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 5dec9388528e..b024f0b061c3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,7 +15,7 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py new file mode 100644 index 000000000000..46da18a40345 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py @@ -0,0 +1 @@ +from .falcon_gpt_model import FalconGPTModel \ No newline at end of file diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py new file mode 100644 index 000000000000..67b93283b0a4 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py @@ -0,0 +1,313 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
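Before the new `falcon_gpt_model.py` below, one detail from patch 11 is worth spelling out: when `megatron_amp_O2` is enabled, the model is wrapped one module deeper, so every checkpoint key gains a `module.` segment. A minimal, dependency-free illustration of that renaming (toy keys, not a real checkpoint):

```
from collections import OrderedDict

# Toy stand-in for the converted checkpoint's state dict.
state_dict = OrderedDict(
    [
        ('model.embedding.word_embeddings.weight', 'emb'),
        ('model.decoder.layers.0.self_attention.linear_qkv.weight', 'qkv'),
        ('model.output_layer.weight', 'head'),
    ]
)

# Same renaming idea as in the patch: only the first 'model.' is replaced,
# so any later occurrence of the substring is left untouched.
for key in list(state_dict.keys()):
    state_dict[key.replace('model.', 'model.module.', 1)] = state_dict.pop(key)

print(list(state_dict.keys()))
# ['model.module.embedding.word_embeddings.weight',
#  'model.module.decoder.layers.0.self_attention.linear_qkv.weight',
#  'model.module.output_layer.weight']
```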
+# just copy paste here, need work +import logging +from typing import Literal, Optional + +import torch +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.gpt.gpt_embedding import GPTEmbedding +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint +from torch import Tensor + +# from megatron.core.transformer.transformer_block import TransformerBlock +from .spec_falcon_decoder_block import FalconTransformerBlock + + +class FalconGPTModel(MegatronModule): + """Transformer language model. + + Arguments: + config (TransformerConfig): transformer config + + vocab_size (int): vocabulary size + + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks + + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are + shared. Defaults to False. + + position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + + seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. + The value must be a float larger than 1.0. Defaults to None. + """ + + def __init__( + self, + config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + ): + super(FalconGPTModel, self).__init__(config=config) + + self.config: TransformerConfig = config + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + # Embeddings. 
+ if self.pre_process: + self.embedding = GPTEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + add_position_embedding=(self.position_embedding_type == 'learned_absolute'), + ) + + # Rotary Position Embeddings + if self.position_embedding_type == 'rope': + rotary_dim = self.config.kv_channels + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) + else: + self.rotary_pos_emb = None + + # Transformer. + self.decoder = FalconTransformerBlock( + config=self.config, + transformer_layer_spec=self.transformer_layer_spec, + self_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + ) + + if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): + self.initialize_last_stage_with_word_embeddings() + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params=None, + ): + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. + if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.rotary_pos_emb is not None: + if inference_params is not None: + rotary_seq_len = inference_params.max_sequence_length + else: + if self.decoder.input_tensor is not None: + rotary_seq_len = self.decoder.input_tensor.size(0) + else: + rotary_seq_len = decoder_input.size(0) + + # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region + if self.config.sequence_parallel: + rotary_seq_len *= self.config.tensor_model_parallel_size + + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. 
+ hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0, 1).contiguous() + return loss + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def initialize_last_stage_with_word_embeddings(self): + + # This function just initializes the word embeddings in the final stage + # when we are using pipeline parallelism and sharing word + # embeddings. Nothing to do if we aren't sharing weights or aren't using + # pipeline parallelism. + if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): + return + + if self.post_process and not self.pre_process: + assert not parallel_state.is_pipeline_first_stage() + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.output_layer.weight.data.fill_(0) + self.output_layer.weight.shared = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + + # Ensure that first and last stages have the same initial parameter + # values. + if torch.distributed.is_initialized(): + if parallel_state.is_rank_in_embedding_group(): + weight = self.shared_embedding_or_output_weight() + torch.distributed.all_reduce( + weight.data, group=parallel_state.get_embedding_group() + ) + + elif not getattr(GPTModel, "embedding_warning_printed", False): + logging.getLogger(__name__).warning( + "Distributed processes aren't initialized, so the output layer " + "is not initialized with weights from the word embeddings. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong." + ) + GPTModel.embedding_warning_printed = True + + def sharded_state_dict(self, prefix=''): + sharded_state_dict = {} + + if self.pre_process: + embedding_prefix = f'{prefix}embedding.' + embedding_sharded_state_dict = self.embedding.sharded_state_dict( + prefix=embedding_prefix + ) + sharded_state_dict.update(embedding_sharded_state_dict) + + decoder_prefix = f'{prefix}decoder.' 
+ decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) + sharded_state_dict.update(decoder_sharded_state_dict) + + if self.post_process: + output_layer_prefix = f'{prefix}output_layer.' + output_layer_key = f'{output_layer_prefix}weight' + if self.share_embeddings_and_output_weights: + if not self.pre_process: + # when sharing embeddings with last stage, we need to use the weights from the first stage + # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight + tensor = self.shared_embedding_or_output_weight() + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + last_stage_word_emb_replica_id = ( + dp_rank + dp_size + ) # copy of first stage embedding + + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + else: + output_layer_state_dict = self.output_layer.state_dict( + prefix=output_layer_prefix, keep_vars=True + ) + output_layer_tensor = output_layer_state_dict[output_layer_key] + # independent output layer + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=output_layer_tensor, + key=output_layer_key, + replica_id=parallel_state.get_data_parallel_rank(), + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py new file mode 100644 index 000000000000..70c90e520937 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py @@ -0,0 +1,43 @@ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, + TENorm, + TEColumnParallelLinear, +) +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules + +# Use this spec for an implementation using modules in TE +falcon_layer_spec = ModuleSpec( + module=FalconTransformerLayer, + submodules=FalconTransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + dot_product_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + post_self_attn_layernorm=TENorm, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, 
linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), +) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py new file mode 100644 index 000000000000..8a009c39b59e --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py @@ -0,0 +1,290 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import re +from contextlib import nullcontext + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig +from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor + + +class FalconTransformerBlock(MegatronModule): + """Transformer class.""" + + def __init__( + self, + config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + + self.self_attn_mask_type = self_attn_mask_type + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + + # required for pipeline parallel schedules + self.input_tensor = None + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + self.num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + self._build_layers(self.transformer_layer_spec) + + def _build_layers(self, transformer_layer_spec): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_number): + layer = FalconTransformerLayer( + config=self.config, + submodules=transformer_layer_spec.submodules, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) + return layer + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # Interleaved pipeline parallelism: + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. 
+ # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size + + num_layers_to_build = num_layers_per_virtual_rank + + else: + # Non-interleaved pipeline parallelism: + # Each stage gets a contiguous set of layers. + + num_layers_to_build = self.num_layers_per_pipeline_rank + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) + + # # TODO: add back standalone_embedding_stage + # if self.num_layers == 0: + # # When a standalone embedding stage is used (e.g., + # # args.standalone_embedding_stage == True), virtual pipeline ranks + # # on pipeline rank 0 will have zero transformer layers assigned to + # # them. This results in the model's input and output tensors to be + # # the same, which will cause failure for certain output tensor + # # optimizations (e.g., pipeline output deallocation). To remedy + # # this, we assign a 'no-op' layer on these ranks, which will + # # disconnect the input tensor from the output tensor. + # self.num_layers = 1 + # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + # else: + # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): + """Forward method with activation checkpointing.""" + + def custom(start, end): + def custom_forward(*args, **kwargs): + x_, *args = args + for index in range(start, end): + layer = self._get_layer(index) + x_ = layer(x_, *args, **kwargs) + return x_ + + return custom_forward + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers_per_pipeline_rank: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + self.config.recompute_num_layers), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + + l += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. 
+ for l in range(self.num_layers_per_pipeline_rank): + if l < self.config.recompute_num_layers: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + 1), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + else: + hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): + # hidden_states (float): [s, b, h] + # attention_mask (bool): [1, 1, s, s] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True, + ) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + if self.config.fp8: + import transformer_engine # To keep out TE dependency when not training in fp8 + + if self.config.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif self.config.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") + + fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=self.config.fp8_margin, + interval=self.config.fp8_interval, + fp8_format=fp8_format, + amax_compute_algo=self.config.fp8_amax_compute_algo, + amax_history_len=self.config.fp8_amax_history_len, + override_linear_precision=(False, False, not self.config.fp8_wgrad), + ) + fp8_group = None + if parallel_state.model_parallel_is_initialized(): + fp8_group = parallel_state.get_amax_reduction_group() + fp8_context = transformer_engine.pytorch.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group + ) + else: + fp8_context = nullcontext() + + with rng_context and fp8_context: + # Forward pass. 
+ if self.config.recompute_granularity == 'full': + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + ) + else: + for layer in self.layers: + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=inference_params, + ) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + def sharded_state_dict(self, prefix=''): + + sharded_state_dict = {} + + layer_prefix = f'{prefix}layers.' + for layer in self.layers: + sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) + + if self.post_process and self.post_layer_norm: + state_dict = self.state_dict(keep_vars=True) + + tensor = state_dict['final_layernorm.weight'] + layer_name = f'{prefix}final_layernorm.weight' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) + + # RMSNorm doesn't have bias. + if 'final_layernorm.bias' in state_dict.keys(): + tensor = state_dict['final_layernorm.bias'] + layer_name = f'{prefix}final_layernorm.bias' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( + tensor, layer_name + ) + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py new file mode 100644 index 000000000000..68f417b04409 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py @@ -0,0 +1,299 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.transformer.attention import SelfAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +@dataclass +class FalconTransformerLayerSubmodules: + input_layernorm: Union[ModuleSpec, type] = IdentityOp + self_attention: Union[ModuleSpec, type] = IdentityOp + self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp + + post_self_attn_layernorm: Union[ModuleSpec, type] = IdentityOp + + pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp + mlp: Union[ModuleSpec, type] = IdentityOp + mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp + + +class FalconTransformerLayer(MegatronModule): + """A single transformer layer. 
+ + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + """ + + def __init__( + self, + config: TransformerConfig, # should come from FalconTransformerConfig class + submodules: FalconTransformerLayerSubmodules, + layer_number: int = 1, + self_attn_mask_type=AttnMaskType.padding, + ): + super().__init__(config=config) + self.config: TransformerConfig = config + + + self.layer_number = layer_number + self._get_layer_offset() + + self.self_attn_mask_type = self_attn_mask_type + + self.new_decoder_architecture = self.config.new_decoder_architecture + + self.parallel_attention = self.config.parallel_attention + + + ## [Module 1: Input Layernorm] Optional Layernorm on the input data + # TODO: add pytorch only layernorm + self.input_layernorm = build_module( + submodules.input_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + ## [Module 2: SelfAttention] + self.self_attention = build_module( + submodules.self_attention, config=self.config, layer_number=layer_number, + ) + + ## [Module 3: BiasDropoutFusion] Optional + self.self_attn_bda = build_module(submodules.self_attn_bda) + + ## [Module 4: Post SelfAttention] Optional Layernorm after self-attn + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) if self.new_decoder_architecture else None + + ## [Module 6: MLP block] + self.mlp = build_module(submodules.mlp, config=self.config) + + ## [Module 7: BiasDropoutFusion] Optional + self.mlp_bda = build_module(submodules.mlp_bda) + + # @jcasper how should we handle nvfuser? + # Set bias+dropout+add fusion grad_enable execution handler. 
+ # TORCH_MAJOR = int(torch.__version__.split('.')[0]) + # TORCH_MINOR = int(torch.__version__.split('.')[1]) + # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = torch.enable_grad + + def _get_layer_offset(self): + + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + total_num_layers = self.config.num_layers + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + total_virtual_chunks = total_num_layers // vp_size + offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) + + else: + # Each stage gets a contiguous set of layers. + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + offset = pipeline_rank * num_layers_per_pipeline_rank + else: + offset = 0 + + return offset + + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + ): + # hidden_states: [s, b, h] + + # Residual connection. + residual = hidden_states + + if self.new_decoder_architecture: + mlp_ln_output = self.pre_mlp_layernorm(hidden_states) + + # Optional Input Layer norm + input_layernorm_output = self.input_layernorm(hidden_states) + + input_mlp_ln = input_layernorm_output + + # Self attention. + attention_output_with_bias = self.self_attention( + input_layernorm_output, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( + attention_output_with_bias, residual, self.config.hidden_dropout + ) + + if not self.new_decoder_architecture: + if self.parallel_attention: + layernorm_output = input_mlp_ln + else: + residual = hidden_states + layernorm_output = self.post_self_attn_layernorm(hidden_states) + + else: + layernorm_output = mlp_ln_output + + mlp_output_with_bias = self.mlp(layernorm_output) + + # falcon specific: + if self.new_decoder_architecture or self.parallel_attention: + mlp_output= mlp_output_with_bias[0] + attn_output= attention_output_with_bias[0] + mlp_output_without_bias = mlp_output + attn_output + mlp_output_with_bias = (mlp_output_without_bias, None) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)( + mlp_output_with_bias, residual, self.config.hidden_dropout + ) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. 
+ output = make_viewless_tensor( + inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True + ) + + return output + + def sharded_state_dict(self, prefix=''): + + # state_dict = self.state_dict(prefix=prefix, keep_vars=True) + state_dict = self.state_dict(keep_vars=True) + + tensor_parallel_layers_axis_map = { + 'self_attention.linear_qkv.weight': 0, + 'self_attention.linear_qkv.bias': 0, + 'self_attention.linear_proj.weight': 1, + 'mlp.linear_fc1.weight': 0, + 'mlp.linear_fc1.bias': 0, + 'mlp.linear_fc2.weight': 1, + } + + offset = self._get_layer_offset() + num_layers = self.config.num_layers + + sharded_state_dict = {} + + for layer_name in state_dict.keys(): + tensor = state_dict[layer_name] + global_layer_offset = self.layer_number - 1 # self.layer_number starts at 1 + layer_key = f'{prefix}{global_layer_offset - offset}.{layer_name}' # module list index in TransformerBlock + sharded_offsets = [(0, global_layer_offset, num_layers)] # PP sharding + + if layer_name in tensor_parallel_layers_axis_map: + tp_axis = tensor_parallel_layers_axis_map[layer_name] + # TP sharding + sharded_offsets.append( + [ + tp_axis + 1, # +1 for PP dimension + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ] + ) + replica_id = parallel_state.get_data_parallel_rank() + else: + replica_id = ( + parallel_state.get_data_parallel_rank() + * parallel_state.get_data_parallel_world_size() + + parallel_state.get_tensor_model_parallel_rank() + ) + + if layer_name.endswith('._extra_state'): + sharded_state_dict[layer_key] = ShardedObject( + f'{prefix}{layer_name}', + tensor, + (num_layers,), + (global_layer_offset,), + replica_id, + ) + + else: + sharded_state_dict[layer_key] = ShardedTensor.from_rank_offsets( + f'{prefix}{layer_name}', + tensor, + *sharded_offsets, + replica_id=replica_id, + prepend_axis_num=1, # for PP sharding + ) + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 45586bffcdce..cf640a813dae 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -15,7 +15,7 @@ import itertools import queue import warnings -from dataclasses import fields +from dataclasses import fields, dataclass from functools import partial from typing import Any, Dict, Iterator, List, Optional, Union @@ -78,6 +78,8 @@ from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal + from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec + from megatron.core.transformer.spec_utils import import_module # TODO @tmoon: Use once available in Megatron-LM # from megatron.core.pipeline_parallel.schedules import DataIteratorList @@ -99,6 +101,25 @@ except (ImportError, ModuleNotFoundError): HAVE_TE = False +def import_falcon_gpt_model(): + """Conditionally import FalconGPTModel. 
+ """ + try: + #from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_spec import falcon_layer_spec + return FalconGPTModel, falcon_layer_spec + except (ImportError, ModuleNotFoundError): + raise ImportError("Failed to import FalconGPTModel. Please ensure the necessary dependencies are installed.") + +@dataclass +class FalconTransformerConfig(TransformerConfig): + """ + Transformer Config for Falcon Variants + """ + + new_decoder_architecture: bool = False + parallel_attention: bool = False class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ @@ -213,7 +234,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) self.mcore_gpt = cfg.get('mcore_gpt', False) - + # Falcon specific args + self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) + self.parallel_attention = cfg.get('parallel_attention', False) + self.rampup_batch_size = self.cfg.get('rampup_batch_size', None) if self.rampup_batch_size: self.prev_consumed_samples = 0 @@ -301,9 +325,32 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" - if self.mcore_gpt: + if self.mcore_gpt and (self.new_decoder_architecture or self.parallel_attention): + FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() + transformer_layer_spec = falcon_layer_spec + #debug + logging.info(f'falcon gpt config, {self.transformer_config}') + model = FalconGPTModel( + config=self.transformer_config, + transformer_layer_spec = transformer_layer_spec, + vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), + max_sequence_length=self.cfg.get('encoder_seq_length', 512), + pre_process=pre_process, + post_process=post_process, + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + ) + + logging.info(f'model architecture is {model}') + + elif self.mcore_gpt: + transformer_layer_spec = gpt_layer_with_transformer_engine_spec model = MCoreGPTModel( config=self.transformer_config, + transformer_layer_spec = transformer_layer_spec, vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -1482,6 +1529,10 @@ def build_transformer_config(self) -> TransformerConfig: gated_linear_unit = activation.endswith('glu') activation_func = activation_to_func(activation) + mcore_gpt = self.cfg.get('mcore_gpt', False) + new_decoder_architecture = self.cfg.get('new_decoder_architecture', False) + parallel_attention = self.cfg.get('parallel_attention', False) + normalization = self.cfg.get('normalization', 'layernorm') if normalization == 'layernorm': normalization = 'LayerNorm' @@ -1563,8 +1614,15 @@ def build_transformer_config(self) -> TransformerConfig: f"The model: {self} does not have field.name: {field.name} in its cfg. " f"Add this key to cfg or config_mapping to make to make it configurable." 
) - - transformer_config = TransformerConfig(**transformer_config_dict) + + if mcore_gpt and (new_decoder_architecture or parallel_attention): + transformer_config = FalconTransformerConfig( + **transformer_config_dict, + new_decoder_architecture = new_decoder_architecture, + parallel_attention = parallel_attention, + ) + else: + transformer_config = TransformerConfig(**transformer_config_dict) return transformer_config From 09952722b393661f1ae29536c383bc66742b2ac6 Mon Sep 17 00:00:00 2001 From: Vivian Date: Sun, 24 Sep 2023 22:32:54 +0000 Subject: [PATCH 13/69] remove old falcon mcore support --- .../megatron/falcon_mcore/falcon_gpt_model.py | 302 ------------------ .../falcon_mcore/falcon_transformer_block.py | 281 ---------------- .../falcon_mcore/falcon_transformer_config.py | 263 --------------- .../falcon_mcore/falcon_transformer_layer.py | 260 --------------- 4 files changed, 1106 deletions(-) delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py deleted file mode 100644 index b76b3d8828ee..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# just copy paste here, need work -import logging -from typing import Literal, Optional - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding -from megatron.core.models.gpt.gpt_embedding import GPTEmbedding -from megatron.core.transformer.enums import AttnMaskType, ModelType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint -from torch import Tensor - -# from megatron.core.transformer.transformer_block import TransformerBlock -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import ( - FalconTransformerBlock, -) - - -class FalconGPTModel(MegatronModule): - """Transformer language model. - - Arguments: - config (TransformerConfig): transformer config - - vocab_size (int): vocabulary size - - max_sequence_length (int): maximum size of sequence. This is used for positional embedding - - pre_process (bool): Include embedding layer (used with pipeline parallelism) - post_process (bool): Include an output layer (used with pipeline parallelism) - - parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks - - share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are - shared. Defaults to False. - - position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. - Defaults is 'learned_absolute'. - - rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. - Defaults to 1.0 (100%). 
Ignored unless position_embedding_type is 'rope'. - - seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. - The value must be a float larger than 1.0. Defaults to None. - """ - - def __init__( - self, - config: TransformerConfig, - vocab_size: int, - max_sequence_length: int, - pre_process: bool = True, - post_process: bool = True, - fp16_lm_cross_entropy: bool = False, - parallel_output: bool = True, - share_embeddings_and_output_weights: bool = False, - position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', - rotary_percent: float = 1.0, - seq_len_interpolation_factor: Optional[float] = None, - ): - super(GPTModel, self).__init__(config=config) - - self.config: TransformerConfig = config - self.vocab_size = vocab_size - self.max_sequence_length = max_sequence_length - self.pre_process = pre_process - self.post_process = post_process - self.fp16_lm_cross_entropy = fp16_lm_cross_entropy - self.parallel_output = parallel_output - self.share_embeddings_and_output_weights = share_embeddings_and_output_weights - self.position_embedding_type = position_embedding_type - - # megatron core pipelining currently depends on model type - # TODO: remove this dependency ? - self.model_type = ModelType.encoder_or_decoder - - # Embeddings. - if self.pre_process: - self.embedding = GPTEmbedding( - config=self.config, - vocab_size=self.vocab_size, - max_sequence_length=self.max_sequence_length, - add_position_embedding=(self.position_embedding_type == 'learned_absolute'), - ) - - # Rotary Position Embeddings - if self.position_embedding_type == 'rope': - rotary_dim = self.config.kv_channels - if rotary_percent < 1.0: - rotary_dim = int(rotary_dim * rotary_percent) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) - else: - self.rotary_pos_emb = None - - # Transformer. - self.decoder = FalconTransformerBlock( - config=self.config, - self_attn_mask_type=AttnMaskType.causal, - pre_process=self.pre_process, - post_process=self.post_process, - ) - - # Output - if post_process: - self.output_layer = tensor_parallel.ColumnParallelLinear( - config.hidden_size, - self.vocab_size, - config=config, - init_method=config.init_method, - bias=False, - skip_bias_add=False, - gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, - ) - - if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): - self.initialize_last_stage_with_word_embeddings() - - def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" - - # This is usually handled in schedules.py but some inference code still - # gives us non-lists or None - if not isinstance(input_tensor, list): - input_tensor = [input_tensor] - - assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' - self.decoder.set_input_tensor(input_tensor[0]) - - def forward( - self, - input_ids: Tensor, - position_ids: Tensor, - attention_mask: Tensor, - decoder_input: Tensor = None, - labels: Tensor = None, - inference_params=None, - ): - # If decoder_input is provided (not None), then input_ids and position_ids are ignored. - # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. - - # Decoder embedding. 
- if decoder_input is not None: - pass - elif self.pre_process: - decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - else: - # intermediate stage of pipeline - # decoder will get hidden_states from encoder.input_tensor - decoder_input = None - - # Rotary positional embeddings - rotary_pos_emb = None - if self.rotary_pos_emb is not None: - if inference_params is not None: - rotary_seq_len = inference_params.max_sequence_length - else: - if self.decoder.input_tensor is not None: - rotary_seq_len = self.decoder.input_tensor.size(0) - else: - rotary_seq_len = decoder_input.size(0) - - # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region - if self.config.sequence_parallel: - rotary_seq_len *= self.config.tensor_model_parallel_size - - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) - - # Run decoder. - hidden_states = self.decoder( - hidden_states=decoder_input, - attention_mask=attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb, - ) - - if not self.post_process: - return hidden_states - - # logits and loss - output_weight = None - if self.share_embeddings_and_output_weights: - output_weight = self.shared_embedding_or_output_weight() - logits, _ = self.output_layer(hidden_states, weight=output_weight) - - if labels is None: - # [s b h] => [b s h] - return logits.transpose(0, 1).contiguous() - - # [b s] => [s b] - labels = labels.transpose(0, 1).contiguous() - loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - - # [s b] => [b, s] - loss = loss.transpose(0, 1).contiguous() - return loss - - def shared_embedding_or_output_weight(self): - if self.pre_process: - return self.embedding.word_embeddings.weight - elif self.post_process: - return self.output_layer.weight - return None - - def initialize_last_stage_with_word_embeddings(self): - - # This function just initializes the word embeddings in the final stage - # when we are using pipeline parallelism and sharing word - # embeddings. Nothing to do if we aren't sharing weights or aren't using - # pipeline parallelism. - if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): - return - - if self.post_process and not self.pre_process: - assert not parallel_state.is_pipeline_first_stage() - # set word_embeddings weights to 0 here, then copy first - # stage's weights using all_reduce below. - self.output_layer.weight.data.fill_(0) - self.output_layer.weight.shared = True - - # Parameters are shared between the word embeddings layers, and the - # heads at the end of the model. In a pipelined setup with more than - # one stage, the initial embedding layer and the head are on different - # workers, so we do the following: - # 1. Create a second copy of word_embeddings on the last stage, with - # initial parameters of 0.0. - # 2. Do an all-reduce between the first and last stage to ensure that - # the two copies of word_embeddings start off with the same - # parameter values. - # 3. In the training loop, before an all-reduce between the grads of - # the two word_embeddings layers to ensure that every applied weight - # update is the same on both stages. - - # Ensure that first and last stages have the same initial parameter - # values. 
- if torch.distributed.is_initialized(): - if parallel_state.is_rank_in_embedding_group(): - weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) - - elif not getattr(GPTModel, "embedding_warning_printed", False): - logging.getLogger(__name__).warning( - "Distributed processes aren't initialized, so the output layer " - "is not initialized with weights from the word embeddings. " - "If you are just manipulating a model this is fine, but " - "this needs to be handled manually. If you are training " - "something is definitely wrong." - ) - GPTModel.embedding_warning_printed = True - - def sharded_state_dict(self, prefix=''): - sharded_state_dict = {} - - if self.pre_process: - embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) - sharded_state_dict.update(embedding_sharded_state_dict) - - decoder_prefix = f'{prefix}decoder.' - decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) - sharded_state_dict.update(decoder_sharded_state_dict) - - if self.post_process: - output_layer_prefix = f'{prefix}output_layer.' - output_layer_key = f'{output_layer_prefix}weight' - if self.share_embeddings_and_output_weights: - if not self.pre_process: - # when sharing embeddings with last stage, we need to use the weights from the first stage - # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight - tensor = self.shared_embedding_or_output_weight() - first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' - dp_rank = parallel_state.get_data_parallel_rank() - dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding - - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=tensor, - key=first_stage_word_emb_key, - replica_id=last_stage_word_emb_replica_id, - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - else: - output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) - output_layer_tensor = output_layer_state_dict[output_layer_key] - # independent output layer - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=output_layer_tensor, - key=output_layer_key, - replica_id=parallel_state.get_data_parallel_rank(), - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py deleted file mode 100644 index d3b9ae63e4ac..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
- -import re -from contextlib import nullcontext - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.fusions.fused_layer_norm import FusedLayerNorm -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor - -# change import FalconTransformerLayer -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import ( - FalconTransformerLayer, -) - - -class FalconTransformerBlock(MegatronModule): - """Transformer class.""" - - def __init__( - self, - config: TransformerConfig, - self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, - pre_process=True, - post_process=True, - ): - super().__init__(config=config) - - self.config: TransformerConfig = config - - self.self_attn_mask_type = self_attn_mask_type - self.post_layer_norm = post_layer_norm - self.pre_process = pre_process - self.post_process = post_process - - # required for pipeline parallel schedules - self.input_tensor = None - - self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' - - self.num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - self._build_layers() - - def _build_layers(self): - # Transformer layers. - # @jcasper can we improve how we deal with layer_number? - # currently it's only used in CoreAttention? - # if self.apply_query_key_layer_scaling: - # coeff = self.layer_number - # self.norm_factor *= coeff - def build_layer(layer_number): - layer = FalconTransformerLayer( - config=self.config, layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type, - ) - return layer - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - # Interleaved pipeline parallelism: - # Number of layers in each model chunk is the number of layers in the stage, - # divided by the number of model chunks in a stage. - # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0] [2] [4] [6] - # Stage 1: [1] [3] [5] [7] - # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0, 1] [4, 5] - # Stage 1: [2, 3] [6, 7] - - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size - - num_layers_to_build = num_layers_per_virtual_rank - - else: - # Non-interleaved pipeline parallelism: - # Each stage gets a contiguous set of layers. - - num_layers_to_build = self.num_layers_per_pipeline_rank - - # offset is implicit in TransformerLayer - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) - - # # TODO: add back standalone_embedding_stage - # if self.num_layers == 0: - # # When a standalone embedding stage is used (e.g., - # # args.standalone_embedding_stage == True), virtual pipeline ranks - # # on pipeline rank 0 will have zero transformer layers assigned to - # # them. 
This results in the model's input and output tensors to be - # # the same, which will cause failure for certain output tensor - # # optimizations (e.g., pipeline output deallocation). To remedy - # # this, we assign a 'no-op' layer on these ranks, which will - # # disconnect the input tensor from the output tensor. - # self.num_layers = 1 - # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) - # else: - # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) - - if self.post_process and self.post_layer_norm: - # Final layer norm before output. - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, - ) - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): - """Forward method with activation checkpointing.""" - - def custom(start, end): - def custom_forward(*args, **kwargs): - x_, *args = args - for index in range(start, end): - layer = self._get_layer(index) - x_ = layer(x_, *args, **kwargs) - return x_ - - return custom_forward - - if self.config.recompute_method == 'uniform': - # Uniformly divide the total number of Transformer layers and checkpoint - # the input activation of each divided chunk. - # A method to further reduce memory usage reducing checkpoints. - l = 0 - while l < self.num_layers_per_pipeline_rank: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + self.config.recompute_num_layers), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - - l += self.config.recompute_num_layers - - elif self.config.recompute_method == 'block': - # Checkpoint the input activation of only a set number of individual - # Transformer layers and skip the rest. - # A method fully use the device memory removing redundant re-computation. - for l in range(self.num_layers_per_pipeline_rank): - if l < self.config.recompute_num_layers: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + 1), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - else: - hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) - else: - raise ValueError("Invalid activation recompute method.") - - return hidden_states - - def set_input_tensor(self, input_tensor): - """Set input tensor to be used instead of forward()'s input. - - When doing pipeline parallelism the input from the previous - stage comes from communication, not from the input, so the - model's forward_step_func won't have it. This function is thus - used by internal code to bypass the input provided by the - forward_step_func""" - self.input_tensor = input_tensor - - def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): - # hidden_states (float): [s, b, h] - # attention_mask (bool): [1, 1, s, s] - - if not self.pre_process: - # See set_input_tensor() - hidden_states = self.input_tensor - - # Viewless tensor. - # - We only need to create a viewless tensor in the case of micro batch - # size (mbs) == 1, since in this case, 'hidden_states.transpose()' - # above creates a view tensor, and '.contiguous()' is a pass-through. 
- # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating - # the need to make it viewless. - # - # However, we don't explicitly check mbs == 1 here because - # make_viewless_tensor() has negligible overhead when its input - # is already viewless. - # - # - For the 'else' case above, calling make_viewless_tensor() here is - # likely redundant, since p2p_communication.py (likely originator) - # already creates viewless tensors. That said, make_viewless_tensor() - # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) - - if self.config.sequence_parallel: - rng_context = tensor_parallel.get_cuda_rng_tracker().fork() - else: - rng_context = nullcontext() - - if self.config.fp8: - import transformer_engine # To keep out TE dependency when not training in fp8 - - if self.config.fp8 == "e4m3": - fp8_format = transformer_engine.common.recipe.Format.E4M3 - elif self.config.fp8 == "hybrid": - fp8_format = transformer_engine.common.recipe.Format.HYBRID - else: - raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") - - fp8_recipe = transformer_engine.common.recipe.DelayedScaling( - margin=self.config.fp8_margin, - interval=self.config.fp8_interval, - fp8_format=fp8_format, - amax_compute_algo=self.config.fp8_amax_compute_algo, - amax_history_len=self.config.fp8_amax_history_len, - override_linear_precision=(False, False, not self.config.fp8_wgrad), - ) - fp8_group = None - if parallel_state.model_parallel_is_initialized(): - fp8_group = parallel_state.get_amax_reduction_group() - fp8_context = transformer_engine.pytorch.fp8_autocast( - enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group - ) - else: - fp8_context = nullcontext() - - with rng_context and fp8_context: - # Forward pass. - if self.config.recompute_granularity == 'full': - hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, - ) - else: - for layer in self.layers: - hidden_states = layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - inference_params=inference_params, - ) - - # Final layer norm. - if self.post_process and self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states - - def sharded_state_dict(self, prefix=''): - - sharded_state_dict = {} - - layer_prefix = f'{prefix}layers.' - for layer in self.layers: - sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) - - if self.post_process and self.post_layer_norm: - state_dict = self.state_dict(keep_vars=True) - - tensor = state_dict['final_layernorm.weight'] - layer_name = f'{prefix}final_layernorm.weight' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - # RMSNorm doesn't have bias. 
- if 'final_layernorm.bias' in state_dict.keys(): - tensor = state_dict['final_layernorm.bias'] - layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py deleted file mode 100644 index e804a6228e70..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# just copy paste here, need work -from dataclasses import dataclass -from typing import Callable - -import torch -import torch.nn.functional as F - -from megatron.core import ModelParallelConfig -from megatron.core.utils import init_method_normal, scaled_init_method_normal - - -@dataclass -class TransformerConfig(ModelParallelConfig): - """Configuration object for megatron-core transformers. - - Attributes: - - # model architecture - num_layers (int): Number of transformer layers in a transformer block. - hidden_size (int): Transformer hidden size. - ffn_hidden_size (int): Transformer Feed-Forward Network hidden size. - This is set to 4*hidden_size if not provided. Defaults to None.') - num_attention_heads (int): Number of transformer attention heads. - kv_channels (int): Projection weights dimension in multi-head attention. - This is set to hidden_size // num_attention_heads if not provided. - Defaults to None. - num_query_groups (int): Number of query groups for group query attention. If None, normal attention is used. - - hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1. - attention_dropout (float): Post attention dropout probability. Defaults to 0.1. - fp32_residual_connection (bool): If true, move residual connections to fp32. - apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residule connection ordering. - Defaults to False. - layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5. - - layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values - around 0. This improves numerical stability. Defaults to False. - - add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two - in MLP layer). Default is True. - - gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False. - - activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu. - - # initialization - init_method (Callable): Method to initialize weights. Note that bias is always set to - zero. Should be a function that takes a single Tensor and - initializes it. Defaults to - megatron.core.utils.init_method_normal(init_method_std) which is - torch.nn.init.normal_ with mean=0.0 and std=init_method_Std. - - output_layer_init_method (Callable): Method to initialize weights of the output layer of - both attention and MLP blocks. Defaults to - megatron.core.utils.scaled_init_method_normal(init_method_std) - which is torch.nn.init.normal_ with mean=0.0 and - std=init_method_std / math.sqrt(2.0 * num_layers). 
- - init_method_std (float): Standard deviation of the zero mean normal for the default - initialization method, not used if init_method and - output_layer_init_method are provided. Defaults to 0.02. - - # mixed-precision - apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True. - attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32. - This should be true if apply_query_key_layer_scaling is true. - - # fusion - bias_gelu_fustion (bool): If true, fuses bias and gelu. Defaults to False. - masked_softmax_fusion (bool): If true, uses softmax fusion. - persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel. - This kernel only supports a fixed set of hidden sizes. - Defaults to False. - bias_dropout_fusion (bool): If true, uses bias dropout fusion. - - # activation recomputation - - recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory - intensive part of attention is checkpointed. These memory intensive activations - are also less compute intensive which makes activation checkpointing more efficient - for LLMs (20B+). See Reducing Activation Recomputation in Large Transformer - Models: https://arxiv.org/abs/2205.05198 for more details. 'full' will checkpoint - the entire transformer layer. Must be 'selective' or 'full'. 'selective' always uses all layers. - Defaults to None. - - recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer - block and recompute the input activation of each divided chunk at the specified - granularity. block will recompute the input activations for only a set number of - transformer layers per pipeline stage. The rest of the layers in the pipeline stage - will not have any activations recomputed. Must be 'uniform' or 'block'. Defaults to - None. - - recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer - layers in each uniformly divided recompute unit. When recompute_method is block, - recompute_num_layers is the number of transformer layers to recompute within each - pipeline stage. Must be None for 'selective' activation checkpointing. Defaults to None. - - distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel - group. Defaults to None. - - # fp8 related (via Transformer Engine). For detailed info, refer the the Transformer Engine docs at - # https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html - - fp8 (str): If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined choices: (1) 'e4m3' - uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 activation and weight tensors and - e5m2 for all FP8 output activation gradient tensors. Defaults to None. - - fp8_margin (int): Margin for the scaling factor computation. - - fp8_interval (int): Controls how often the scaling factor is recomputed. - - fp8_amax_history_len (int): The length of the amax history window used for scaling factor computation. - - fp8_amax_compute_algo (str): Algorithm used for choosing the `amax` value for the scaling factor computation. - There are 2 predefined choices: `max` chooses the largest `amax` in the history - window, while `most_recent` always chooses the most recently seen value. 
- - fp8_wgrad (bool): When set to False, override FP8 config options and do the wgrad computation in higher precision. - Defaults to True. - - # Experimental - normalization (str): Swtich b/w `LayerNorm` and `RMSNorm` as normalization layers. For now, these are primarily - used by Transformer-Engine's layers like `LayerNormLinear`. Default value is `LayerNorm`. - - - """ - - # model architecture - num_layers: int = 0 - hidden_size: int = 0 - num_attention_heads: int = 0 - num_query_groups: int = None - - ffn_hidden_size: int = None - kv_channels: int = None - hidden_dropout: float = 0.1 - attention_dropout: float = 0.1 - fp32_residual_connection: bool = False - # @jcasper should we keep this option? - apply_residual_connection_post_layernorm: bool = False - layernorm_epsilon: float = 1e-5 - layernorm_zero_centered_gamma: bool = False - add_bias_linear: bool = True - gated_linear_unit: bool = False - activation_func: Callable = F.gelu - - # initialization - init_method: Callable = None - output_layer_init_method: Callable = None - init_method_std: float = 0.02 - - # mixed-precision - apply_query_key_layer_scaling: bool = True - attention_softmax_in_fp32: bool = True - - # communication - - # fusion - bias_gelu_fusion: bool = False # TODO: this should be bias_activation_fusion ? - masked_softmax_fusion: bool = False - persist_layer_norm: bool = False - bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion? - - # activation recomputation - recompute_granularity: str = None - recompute_method: str = None - recompute_num_layers: int = None - distribute_saved_activations: bool = None - - # fp8 related - fp8: str = None - fp8_margin: int = 0 - fp8_interval: int = 1 - fp8_amax_history_len: int = 1 - fp8_amax_compute_algo: str = "most_recent" - fp8_wgrad: bool = True - - # experimental section (TODO: move to apt. section above once stable) - normalization: bool = "LayerNorm" # alt value supported by TE: "RMSNorm" - - def __post_init__(self): - """ Python dataclass method that is used to modify attributes after initialization. - See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. - """ - super().__post_init__() - if self.fp16 and self.bf16: - raise ValueError(f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.') - - if self.num_attention_heads % self.tensor_model_parallel_size != 0: - raise ValueError( - f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " - f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." - ) - - if self.ffn_hidden_size is None: - self.ffn_hidden_size = 4 * self.hidden_size - - if self.kv_channels is None: - self.kv_channels = self.hidden_size // self.num_attention_heads - - if self.num_query_groups is None: - self.num_query_groups = self.num_attention_heads - - if self.num_query_groups % self.tensor_model_parallel_size != 0: - raise ValueError( - f"num_query_groups ({self.num_query_groups}) must be a multiple of " - f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." - ) - - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - - if self.recompute_granularity is not None: - if not self.recompute_granularity in ['full', 'selective']: - raise ValueError( - f'When using recompute_granuarlity: {self.recompute_granularity} must be "full" or "selective".' 
- ) - - if self.recompute_method is not None: - if not self.recompute_method in ['block', 'uniform']: - raise ValueError(f'recompute_method: {self.recompute_method} must be "block" or "uniform".') - elif self.recompute_granularity != 'selective': - raise ValueError( - f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' - ) - - if self.recompute_granularity != 'selective' and self.recompute_num_layers is None: - raise ValueError( - f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' - f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' - ) - elif self.recompute_granularity == 'selective' and self.recompute_num_layers is not None: - raise ValueError( - f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' - ) - - if self.distribute_saved_activations and self.sequence_parallel: - raise ValueError( - f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}' - ) - - if self.virtual_pipeline_model_parallel_size is not None: - if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0: - raise ValueError( - f'num_layers: {self.num_layers} must be divisible by virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}' - ) - - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - - if self.bias_gelu_fusion: - if not self.add_bias_linear: - raise ValueError("When bias_gelu_fusion is True, add_bias_linear must also be True.") - - if self.activation_func != F.gelu: - raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.') - - if self.init_method is None: - self.init_method = init_method_normal(self.init_method_std) - - if self.output_layer_init_method is None: - self.output_layer_init_method = scaled_init_method_normal(self.init_method_std, self.num_layers) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py deleted file mode 100644 index e592a1e0b1ac..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import re - -import torch -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.identity_op import IdentityOp -from megatron.core.transformer.mlp import MLP -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_viewless_tensor - -# from megatron.core.transformer.attention import SelfAttention -# change attention due to extra layernorm before mlp, ln_mlp. 
-from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention - -""" We use the following notation throughout this file: - h: hidden size - n: number of attention heads - p: number of model parallel partitions - np: n/p - hp: h/p - hn: h/n - b: batch size - s: sequence length - l: number of layers - Transformer takes input of size [s, b, h] and returns a - tensor of the same size. We use the following arguments: - hyperparameters: transformer hyperparameters -""" - - -class FalconTransformerLayer(MegatronModule): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - - Args: - new_decoder_architecture (bool): - Whether to use Falcon's new decoder architecture that were used in 7B/40B/180B variants. - - parallel_attention (bool): - Whether to use parallel attention, which computes attention in parallel with feed forward layer. - - """ - - def __init__( - self, - config: TransformerConfig, - layer_number: int = 1, - self_attn_mask_type=AttnMaskType.padding, - parallel_attention=False, - new_decoder_architecture=False, - ): - super().__init__(config=config) - self.config: TransformerConfig = config - - self.layer_number = layer_number + self._get_layer_offset() - - self.self_attn_mask_type = self_attn_mask_type - - self.new_decoder_architecture = new_decoder_architecture - self.parallel_attention = parallel_attention - - # Layernorm on the input data. - # TODO: add pytorch only layernorm - self.input_layernorm = self._create_identity_op() - - self.mlp_layernorm = self._create_identity_op() if self.new_decoder_architecture else None - - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - # Layernorm on the attention output - self.post_self_attn_layernorm = self._create_identity_op() - - # Self attention. - self.self_attention = SelfAttention( - config=self.config, layer_number=layer_number, attn_mask_type=self_attn_mask_type, - ) - - # MLP - self.mlp = MLP(config=self.config) - - # @jcasper how should we handle nvfuser? - # Set bias+dropout+add fusion grad_enable execution handler. 
- # TORCH_MAJOR = int(torch.__version__.split('.')[0]) - # TORCH_MINOR = int(torch.__version__.split('.')[1]) - # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) - # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad - self.bias_dropout_add_exec_handler = torch.enable_grad - - def _create_identity_op(self): - """Helper function to create an IdentityOp with common parameters.""" - return IdentityOp( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, - ) - - def _get_layer_offset(self): - - pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() - - num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - total_num_layers = self.config.num_layers - num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size - total_virtual_chunks = total_num_layers // vp_size - offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) - - else: - # Each stage gets a contiguous set of layers. - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - offset = pipeline_rank * num_layers_per_pipeline_rank - else: - offset = 0 - - return offset - - def forward( - self, - hidden_states, - attention_mask, - encoder_output=None, - enc_dec_attn_mask=None, - inference_params=None, - rotary_pos_emb=None, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - input_mlp_ln = layernorm_output - - # Self attention. - attention_output_with_bias = self.self_attention( - layernorm_output, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, - ) - - # Residual connection. - if self.config.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - # falcon specific - if self.new_decoder_architecture: - mlp_ln_output = self.mlp_layernorm(hidden_states) - - bias_dropout_add_func = get_bias_dropout_add(self.training, self.config.bias_dropout_fusion) - - # bias_dropout_add fusion returning fp32 instead of bf16 - with self.bias_dropout_add_exec_handler(): - layernorm_input = bias_dropout_add_func(attention_output_with_bias, residual, self.config.hidden_dropout) - - # falcon specific - if not self.new_decoder_architecture: - if self.parallel_attention: - layernorm_output = input_mlp_ln - else: - layernorm_output = self.post_self_attn_layernorm(layernorm_input) - residual = ( - layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output - ) - else: - layernorm_output = mlp_ln_output - - # MLP. 
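For orientation, the Falcon-specific branching above reduces to three residual wirings. A minimal sketch in plain PyTorch (dropout, bias fusion and model parallelism omitted; the class and attribute names here are illustrative, not the Megatron ones):

```python
import torch.nn as nn


class FalconBlockSketch(nn.Module):
    """Illustrative only: how the three Falcon layer variants combine residuals."""

    def __init__(self, hidden_size, attn, mlp, new_decoder_architecture=False, parallel_attention=False):
        super().__init__()
        self.attn, self.mlp = attn, mlp
        self.new_decoder_architecture = new_decoder_architecture
        self.parallel_attention = parallel_attention
        self.input_ln = nn.LayerNorm(hidden_size)  # ln_attn / input_layernorm
        self.mlp_ln = nn.LayerNorm(hidden_size) if new_decoder_architecture else None  # ln_mlp
        self.post_attn_ln = (
            None if (new_decoder_architecture or parallel_attention) else nn.LayerNorm(hidden_size)
        )

    def forward(self, x):
        attn_out = self.attn(self.input_ln(x))
        if self.new_decoder_architecture:
            # 40B/180B style: two layernorms, attention and MLP both read the layer input.
            return x + attn_out + self.mlp(self.mlp_ln(x))
        if self.parallel_attention:
            # 7B style: one layernorm shared by attention and MLP, a single residual add.
            return x + attn_out + self.mlp(self.input_ln(x))
        # falcon-rw style: sequential, GPT-2-like residual structure.
        h = x + attn_out
        return h + self.mlp(self.post_attn_ln(h))
```

The conversion script later in this series relies on the same distinction when deciding whether a HuggingFace layernorm maps to `input_layernorm`, `pre_mlp_layernorm`, or `post_self_attn_layernorm`.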
- mlp_output_with_bias = self.mlp(layernorm_output) - - # falcon specific: - if self.new_decoder_architecture or self.parallel_attention: - mlp_output_with_bias = mlp_output_with_bias + attention_output_with_bias - - with self.bias_dropout_add_exec_handler(): - output = bias_dropout_add_func(mlp_output_with_bias, residual, self.config.hidden_dropout) - - # Jit compiled function creates 'view' tensor. This tensor - # potentially gets saved in the MPU checkpoint function context, - # which rejects view tensors. While making a viewless tensor here - # won't result in memory savings (like the data loader, or - # p2p_communication), it serves to document the origin of this - # 'view' tensor. - output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True) - - return output - - def sharded_state_dict(self, prefix=''): - - # state_dict = self.state_dict(prefix=prefix, keep_vars=True) - state_dict = self.state_dict(keep_vars=True) - - tensor_parallel_layers_axis_map = { - 'self_attention.linear_qkv.weight': 0, - 'self_attention.linear_qkv.bias': 0, - 'self_attention.linear_proj.weight': 1, - 'mlp.linear_fc1.weight': 0, - 'mlp.linear_fc1.bias': 0, - 'mlp.linear_fc2.weight': 1, - } - - offset = self._get_layer_offset() - num_layers = self.config.num_layers - - sharded_state_dict = {} - - for layer_name in state_dict.keys(): - tensor = state_dict[layer_name] - global_layer_offset = self.layer_number - 1 # self.layer_number starts at 1 - layer_key = f'{prefix}{global_layer_offset - offset}.{layer_name}' # module list index in TransformerBlock - sharded_offsets = [(0, global_layer_offset, num_layers)] # PP sharding - - if layer_name in tensor_parallel_layers_axis_map: - tp_axis = tensor_parallel_layers_axis_map[layer_name] - # TP sharding - sharded_offsets.append( - [ - tp_axis + 1, # +1 for PP dimension - parallel_state.get_tensor_model_parallel_rank(), - parallel_state.get_tensor_model_parallel_world_size(), - ] - ) - replica_id = parallel_state.get_data_parallel_rank() - else: - replica_id = ( - parallel_state.get_data_parallel_rank() * parallel_state.get_data_parallel_world_size() - + parallel_state.get_tensor_model_parallel_rank() - ) - - if layer_name.endswith('._extra_state'): - sharded_state_dict[layer_key] = ShardedObject( - f'{prefix}{layer_name}', tensor, (num_layers,), (global_layer_offset,), replica_id, - ) - - else: - sharded_state_dict[layer_key] = ShardedTensor.from_rank_offsets( - f'{prefix}{layer_name}', - tensor, - *sharded_offsets, - replica_id=replica_id, - prepend_axis_num=1, # for PP sharding - ) - - return sharded_state_dict From f2ad089c8c7852bd06035a76393919eaba964256 Mon Sep 17 00:00:00 2001 From: vivian Date: Thu, 28 Sep 2023 07:11:20 +0000 Subject: [PATCH 14/69] refactor conversion script to align with others --- .../convert_hf_falcon_to_nemo.py | 482 +++++++----------- 1 file changed, 192 insertions(+), 290 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index f7995d471e24..5fb198a3fd91 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -23,109 +23,65 @@ Example to run this conversion script: ``` python convert_hf_falcon_to_nemo.py \ - --in-file \ - --out-file \ - --tokenizer-type \ + --config /path/to/megatron_gpt_config.yaml \ + --input \ + --output \ --precision ``` """ -import logging +import argparse import os +from typing import Dict import time -from 
argparse import ArgumentParser -from collections import OrderedDict +import pytorch_lightning as pl import torch +import yaml from omegaconf import OmegaConf -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.trainer.trainer import Trainer -from transformers import AutoModelForCausalLM, AutoTokenizer, FalconConfig +from transformers import FalconConfig, AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) - -# TODO: -# [Y] refactor ckpt func to make it cleaner -# [Y] dict tokenizer mapping for falcon family -# [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml -# [ ] safetensors loading. (only 180b used safetensors) -# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) -# [Y] hf config name mapping for falcon 7b and 40b. -# [Y] trust remote code add -# [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) -# [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error -# [Y] update save_to and restore_from for dist checkpointing -# [ ] remove unnecessary comments and codes. - - -def setup_logging(log_file="test.log"): - logging.basicConfig( - filename=log_file, - level=logging.DEBUG, - format='%(asctime)s [%(levelname)s] - %(message)s', - datefmt='%d-%b-%y %H:%M:%S', - ) - - -def get_args(): - parser = ArgumentParser() - parser.add_argument( - "--in-file", type=str, default=None, required=True, help="Path to Huggingface Falcon checkpoints", - ) - parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.") - parser.add_argument("--precision", type=str, default="32", help="Model precision") - parser.add_argument( - "--tokenizer-type", - type=str, - default="tiiuae/falcon-7b", - help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'.", - ) - args = parser.parse_args() - return args - - -def load_model(cls, checkpoint, strict, **kwargs): - try: - if 'cfg' in kwargs: - model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging + +def convert_state_dict(state_dict: Dict[str, torch.Tensor], amp: bool = False): + def get_new_key(old_key): + if old_key == "transformer.word_embeddings.weight": + return "embedding.word_embeddings.weight" + elif old_key.startswith("transformer.ln_f"): + return old_key.replace("transformer.ln_f", "decoder.final_layernorm") + elif old_key.startswith("lm_head"): + return old_key.replace("lm_head", "output_layer") + + # For the rest, a base transformation + key = old_key.replace("transformer.h", "decoder.layers") + + # Handling the layer normalization replacements + if falcon_config.new_decoder_architecture: + key = key.replace("ln_attn", "input_layernorm") + key = key.replace("ln_mlp", "pre_mlp_layernorm") else: - model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) - for name, module in model.named_parameters(): - if name in checkpoint['state_dict']: - module.data = checkpoint['state_dict'][name] - checkpoint['state_dict'].pop(name) - else: - print(f"Unexpected key: {name} not in checkpoint but in model.") - - for name, buffer in model.named_buffers(): - if name in checkpoint['state_dict']: 
- buffer.data = checkpoint['state_dict'][name] - checkpoint['state_dict'].pop(name) + key = key.replace("input_layernorm", "input_layernorm") + if not falcon_config.parallel_attn: + key = key.replace("post_attention_layernorm", "post_self_attn_layernorm") + + key = key.replace("self_attention.dense", "self_attention.linear_proj") + key = key.replace("self_attention.query_key_value", "self_attention.linear_qkv") + key = key.replace("dense_h_to_4h", "linear_fc1") + key = key.replace("dense_4h_to_h", "linear_fc2") + return key - if len(checkpoint['state_dict'].keys()) != 0: - raise RuntimeError( - f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." - ) + new_dict = {} + # amp O2 mode has different state dict name + prefix = "model.module." if amp else "model." - # register the artifacts - cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] - if cfg.tokenizer.model is not None: - model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) - if cfg.tokenizer.vocab_file is not None: - model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) - if cfg.tokenizer.merge_file is not None: - model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) - finally: - cls._set_model_restore_state(is_being_restored=False) - return model + for old_key, val in state_dict.items(): + new_key = get_new_key(old_key) + new_key = prefix + new_key + new_dict[new_key] = val + return new_dict def load_falcon_config(args) -> FalconConfig: """ Helper utility to load FalconConfig. @@ -134,8 +90,7 @@ def load_falcon_config(args) -> FalconConfig: `transformers.FalconModel`. need to manually set the config values and force to `falcon` model type. """ - config = FalconConfig.from_pretrained(args.in_file) - + config = FalconConfig.from_pretrained(args.input) if config.model_type == 'RefinedWeb': mappings = { "num_hidden_layers": config.n_layer, @@ -159,229 +114,176 @@ def load_falcon_config(args) -> FalconConfig: config.model_type = 'falcon' return config +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", type=str, required=True, help="Path to the megatron_gpt_config.yaml file" + ) + parser.add_argument( + "--input", type=str, required=True, help="Falcon variants from HuggingFace hub or local dir with downloaded model" + ) + parser.add_argument( + "--output", type=str, default=".", help="Path to dir where to store output .nemo file" + ) + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + parser.add_argument( + "--cuda", action="store_true", help="Put Nemo model onto GPU prior to saving" + ) + + args = parser.parse_args() -def load_nemo_config(args): + if not os.path.isdir(args.output): + raise FileNotFoundError(f"Output directory '{args.output}' does not exist") + falcon_config = load_falcon_config(args) logging.info(f"falcon_config, {falcon_config}") - nemo_config = OmegaConf.load( - os.path.join(os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gpt_config.yaml') - ).model - nemo_config.encoder_seq_length = falcon_config.max_position_embeddings - nemo_config.num_layers = int(falcon_config.num_hidden_layers) - nemo_config.hidden_size = falcon_config.hidden_size - nemo_config.num_attention_heads = falcon_config.num_attention_heads - nemo_config.max_position_embeddings = falcon_config.max_position_embeddings - nemo_config.init_method_std = falcon_config.initializer_range - 
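To make the mapping concrete, a few representative renames that the new `convert_state_dict` above should produce for a new-decoder-architecture checkpoint (illustrative pairs only; the `model.` prefix becomes `model.module.` when `megatron_amp_O2` is enabled):

```python
# HuggingFace Falcon key -> NeMo / mcore key
EXPECTED_RENAMES = {
    "transformer.word_embeddings.weight": "model.embedding.word_embeddings.weight",
    "transformer.h.0.ln_attn.weight": "model.decoder.layers.0.input_layernorm.weight",
    "transformer.h.0.ln_mlp.weight": "model.decoder.layers.0.pre_mlp_layernorm.weight",
    "transformer.h.0.self_attention.query_key_value.weight": "model.decoder.layers.0.self_attention.linear_qkv.weight",
    "transformer.h.0.self_attention.dense.weight": "model.decoder.layers.0.self_attention.linear_proj.weight",
    "transformer.h.0.mlp.dense_h_to_4h.weight": "model.decoder.layers.0.mlp.linear_fc1.weight",
    "transformer.h.0.mlp.dense_4h_to_h.weight": "model.decoder.layers.0.mlp.linear_fc2.weight",
    "transformer.ln_f.weight": "model.decoder.final_layernorm.weight",
    "lm_head.weight": "model.output_layer.weight",
}
```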
nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon - try: - if falcon_config.alibi: + with open(args.config, "r", encoding="utf_8") as f: + orig_cfg = yaml.safe_load(f) + + model_dict = orig_cfg["model"] + + if "data" in model_dict: + del model_dict["data"] + + override_model_dict = { + "micro_batch_size": 1, + "global_batch_size": 1, + "tensor_model_parallel_size": 1, + "pipeline_model_parallel_size": 1, + "megatron_amp_O2": False, + "transformer_engine": True, + "use_cpu_initialization": not args.cuda, + "normalization": "layernorm", + "mcore_gpt": True, + "num_query_groups": None, # MHA + "hidden_size": falcon_config.hidden_size, + "encoder_seq_length": falcon_config.max_position_embeddings, + "max_position_embeddings": falcon_config.max_position_embeddings, + "num_layers": falcon_config.num_hidden_layers, + "num_attention_heads": falcon_config.num_attention_heads, + "ffn_hidden_size": falcon_config.hidden_size * 4, + "layernorm_epsilon": falcon_config.layer_norm_epsilon, + "pre_process": True, + "post_process": True, + "apply_query_key_layer_scaling": False, + "bias": falcon_config.bias, + "transformer_block_type": "pre_ln", + "fp32_residual_connection": False, + "hidden_dropout": falcon_config.hidden_dropout, + "attention_dropout": falcon_config.attention_dropout, + "ffn_dropout": 0, + "share_embeddings_and_output_weights": False, + "position_embedding_type": "rope", + "precision": args.precision, + "init_method_std": falcon_config.initializer_range, + "new_decoder_architecture": falcon_config.new_decoder_architecture, + "parallel_attention": falcon_config.parallel_attn, + "activation": "gelu", + "bias_activation_fusion": False, + "bias_dropout_add_fusion": False, + "seq_len_interpolation_factor": None, + } + tokenizer_dict = { + "library": "huggingface", + "type": args.input, + "use_fast": True, + } + trainer_dict = { + "devices": 1, + "num_nodes": 1, + "accelerator": "gpu" if args.cuda else "cpu", + "precision": args.precision, + "logger": False, + "enable_checkpointing": False, + "max_epochs": -1, + "max_steps": 100000, + "log_every_n_steps": 10, + "val_check_interval": 100, + "limit_val_batches": 50, + "limit_test_batches": 500, + "accumulate_grad_batches": 1, + "gradient_clip_val": 1.0, + "benchmark": False, + "enable_model_summary": False, + "strategy": NLPDDPStrategy(), + } + + # Additional logic for position_embedding_type = alibi + if falcon_config.alibi: + try: raise ValueError( "Alibi is not yet supported in Megatron Core, \ force to use RoPE will generate suboptimal responses" ) - except ValueError as e: - print(e) - finally: - nemo_config.position_embedding_type = 'rope' - nemo_config.bias = falcon_config.bias - nemo_config.hidden_dropout = falcon_config.hidden_dropout - nemo_config.attention_dropout = falcon_config.attention_dropout - # TODO: how does vocab_file, merge_file etc get mapped automatically in respect to variants of falcon models? - tokenizer_dict = { - 'library': 'huggingface', - 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? 
- } - - nemo_config.tokenizer = tokenizer_dict - - nemo_config.new_decoder_architecture = falcon_config.new_decoder_architecture #bool, if True, always use parallel attn - nemo_config.parallel_attention = falcon_config.parallel_attn + except ValueError as e: + print(e) - nemo_config.num_query_groups = ( - falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None - ) - nemo_config.use_cpu_initialization = True - nemo_config.activation = 'gelu' + # Additional logic for num_query_groups + if override_model_dict.get("num_query_groups") is None: + override_model_dict["num_query_groups"] = ( + falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None + ) + + # Additional logic for bias fusion + if falcon_config.bias: + override_model_dict["bias_activation_fusion"] = True + override_model_dict["bias_dropout_add_fusion"] = True + + # Addtional logic for rope scaling if falcon_config.rope_scaling is not None: if falcon_config.rope_scaling.type == 'linear': - nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor + override_model_dict['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - - nemo_config.mcore_gpt = True - nemo_config.transformer_engine = True - nemo_config.bias_activation_fusion = False - nemo_config.bias_dropout_add_fusion = False - nemo_config.share_embeddings_and_output_weights = False - - base = 128 - while falcon_config.vocab_size % base != 0: - base //= 2 - nemo_config.make_vocab_size_divisible_by = base - - return nemo_config - - -def determine_precision(args): - """Helper function to determine the precision of model - """ - if args.precision in ["32", "16"]: - return int(args.precision) - elif args.precision in ["bf16", "bf16-mixed"]: - if not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()): - logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") - return args.precision[2:] # prune 'bf' from string - return args.precision - - -def determine_dtype(precision): - dtype_map = { - "32": torch.float32, - "16": torch.float16, - "16-mixed": torch.float16, - "bf16": torch.bfloat16, - "bf16-mixed": torch.bfloat16, - } - return dtype_map.get(precision, torch.float32) # default to torch.float32 - - -def convert(args): - logging.info(f"loading checkpoint {args.in_file}") - tik = time.time() - model = AutoModelForCausalLM.from_pretrained(args.in_file, trust_remote_code=True) - falcon_config = load_falcon_config(args) - # debug - logging.debug(f"initial falcon_config, {falcon_config}") - - nemo_config = load_nemo_config(args) - # debug - logging.debug(f"initial nemo_config, {nemo_config}") - precision = determine_precision(args) - - plugins = [] - - if precision in ['16', '16-mixed', 'bf16', 'bf16-mixed']: - scaler_params = { - 'init_scale': nemo_config.get('native_amp_init_scale', 2 ** 32), - 'growth_interval': nemo_config.get('native_amp_growth_interval', 1000), - 'hysteresis': nemo_config.get('hysteresis', 2), - } - - plugin_precision = '16-mixed' if precision in ['16', '16-mixed'] else 'bf16-mixed' - scaler = GradScaler(**scaler_params) if precision in ['16', '16-mixed'] else None - - dtype = determine_dtype(precision) - nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + - hidden_size = falcon_config.hidden_size - head_num = falcon_config.num_attention_heads - head_size = hidden_size // head_num - num_layers = falcon_config.num_hidden_layers - - # - MHA: num_heads = num_kv_heads - # - Multi-Query Attention: num_kv_heads = 1 - # - Grouped-Query Attention: num_heads % num_kv_heads = 0 - num_query_groups = ( - nemo_config.num_query_groups - if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num - else head_num - ) - assert ( - head_num % num_query_groups == 0 - ), f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' + model_dict.update(override_model_dict) + model_dict["tokenizer"] = tokenizer_dict + model_dict["name"] = 'megatron_falcon_gpt' - param_to_weights = lambda param: param.float() - - checkpoint = OrderedDict() - checkpoint['state_dict'] = OrderedDict() - - def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): - source_name = f"{source_prefix}.{weight_or_bias}" - if source_name in model.state_dict(): - target_name = f"{target_prefix}.{weight_or_bias}" - checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) - - def add_weight_and_possible_bias(source_prefix, target_prefix): - add_to_checkpoint(source_prefix, target_prefix, 'weight') - if f"{source_prefix}.bias" in model.state_dict(): - add_to_checkpoint(source_prefix, target_prefix, 'bias') - - add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') + omega_cfg = OmegaConf.create(model_dict) + + # output_path = "./falcon_megatron_config.yaml" + # OmegaConf.save(config=omega_cfg, f=output_path) - for l in range(int(num_layers)): - print(f"converting layer {l}") - prefix = f'transformer.h.{l}' + trainer = pl.Trainer(**trainer_dict) - add_weight_and_possible_bias( - f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv' - ) - add_weight_and_possible_bias( - f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj' - ) - 
add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') - add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') - - if falcon_config.new_decoder_architecture: - add_weight_and_possible_bias( - f'{prefix}.ln_attn', - f'model.decoder.layers.{l}.input_layernorm', - ) - add_weight_and_possible_bias( - f'{prefix}.ln_mlp', - f'model.decoder.layers.{l}.pre_mlp_layernorm', - ) - else: - add_weight_and_possible_bias( - f'{prefix}.input_layernorm', - f'model.decoder.layers.{l}.input_layernorm', - ) - if not falcon_config.parallel_attn: - add_weight_and_possible_bias( - f'{prefix}.post_attention_layernorm', - f'model.decoder.layers.{l}.post_self_attn_layernorm', - ) - - print(f"done layer {l}") + logging.info("Creating Megatron model...") + tik = time.time() + model = MegatronGPTModel(omega_cfg, trainer) + logging.info(f"Created model:\n{model}") - # final layer norm - add_weight_and_possible_bias('transformer.ln_f', 'model.decoder.final_layernorm') + logging.info("Loading HuggingFace model...") + model_hf = AutoModelForCausalLM.from_pretrained(args.input, trust_remote_code=True) + logging.info(f"Loaded model:\n{model_hf}") - # LM weight - add_to_checkpoint('lm_head', 'model.output_layer', 'weight') + state_dict_hf = model_hf.state_dict() + convert_dict = convert_state_dict(state_dict_hf, amp=omega_cfg.megatron_amp_O2) - checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config - #logging.debug(f'final checkpoint, {checkpoint}') + logging.info("Loading state dict...") + missing_keys, unexpected_keys = model.load_state_dict(convert_dict, strict=False) - del model - - # state dict name for megatron_amp_O2 is different - if nemo_config.get('megatron_amp_O2', False): - keys = list(checkpoint['state_dict'].keys()) - for key in keys: - checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key) + if missing_keys: + # Keys ending with '_extra_state' are related to Transformer Engine internals + missing_keys_non_extra = [key for key in missing_keys if not key.endswith("_extra_state")] + if missing_keys_non_extra: + logging.critical("Missing keys were detected during the load, something has gone wrong. Aborting.") + raise RuntimeError(f"Missing keys: \n{missing_keys_non_extra}") - #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) - model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) + if unexpected_keys: + logging.critical("Unexpected keys were detected which should not happen. 
Aborting.") + raise RuntimeError(f"Unexpected keys: \n{unexpected_keys}") - model._save_restore_connector = NLPSaveRestoreConnector() + logging.info("Saving model...") - # cast to target precision and disable cpu init + dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = model.to(dtype=dtype) - model.cfg.use_cpu_initialization = False - # We make sure that the tokenizer can be instantiated later regardless of args.input - model.cfg.tokenizer.update(type=args.tokenizer_type) - # save model - - model.save_to(args.out_file) - logging.info(f'NeMo model saved to: {args.out_file}') - + model.cfg.update(use_cpu_initialization=False) + name_last_part = os.path.basename(args.input.rstrip('/')) + model.save_to(os.path.join(args.output, f'falcon_{name_last_part}_{args.precision}_tp1_pp1.nemo')) + logging.info("Done.") tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logging.info(f'Weights loaded and saved. Total time: {t}') - - -if __name__ == '__main__': - setup_logging() - args = get_args() - convert(args) + logging.info(f'nemo model created and saved. Total time: {t}') \ No newline at end of file From 47d2f23fd50f4627f9804da273d28c0c2a349dc3 Mon Sep 17 00:00:00 2001 From: vivian Date: Thu, 28 Sep 2023 07:16:46 +0000 Subject: [PATCH 15/69] add support for falcon-rw model (normal gpt architecture) --- .../spec_falcon/spec_falcon_decoder_layer.py | 12 +++++++++--- .../models/language_modeling/megatron_gpt_model.py | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py index 68f417b04409..a7bb1a588e40 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py @@ -67,9 +67,15 @@ def __init__( self.self_attn_mask_type = self_attn_mask_type - self.new_decoder_architecture = self.config.new_decoder_architecture - - self.parallel_attention = self.config.parallel_attention + if hasattr(self.config, 'new_decoder_architecture'): + self.new_decoder_architecture = self.config.new_decoder_architecture + else: + self.new_decoder_architecture = None + + if hasattr(self.config, 'parallel_attention'): + self.parallel_attention = self.config.parallel_attention + else: + self.parallel_attention = None ## [Module 1: Input Layernorm] Optional Layernorm on the input data diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index cf640a813dae..464457a13632 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -235,6 +235,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.mcore_gpt = cfg.get('mcore_gpt', False) # Falcon specific args + self.falcon_name = cfg.get('name', 'megatron_falcon_gpt') self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) self.parallel_attention = cfg.get('parallel_attention', False) @@ -325,7 +326,7 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" - if self.mcore_gpt and (self.new_decoder_architecture or self.parallel_attention): + if self.mcore_gpt and self.falcon_name: 
FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() transformer_layer_spec = falcon_layer_spec #debug @@ -344,7 +345,6 @@ def model_provider_func(self, pre_process, post_process): seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), ) - logging.info(f'model architecture is {model}') elif self.mcore_gpt: transformer_layer_spec = gpt_layer_with_transformer_engine_spec From ed8869a81652efad710bb538481c2510a5f08087 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:15:00 +0000 Subject: [PATCH 16/69] modify falcon 7b config and remove trust remote code due to HF code changes --- .../nlp_language_modeling/convert_hf_falcon_to_nemo.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 5fb198a3fd91..a47ec258662e 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -222,9 +222,10 @@ def load_falcon_config(args) -> FalconConfig: # Additional logic for num_query_groups if override_model_dict.get("num_query_groups") is None: - override_model_dict["num_query_groups"] = ( - falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None - ) + if falcon_config.new_decoder_architecture: + override_model_dict["num_query_groups"] = falcon_config.num_kv_heads + elif falcon_config.multi_query: + override_model_dict["num_query_groups"] = 1 # Additional logic for bias fusion if falcon_config.bias: @@ -256,7 +257,7 @@ def load_falcon_config(args) -> FalconConfig: logging.info(f"Created model:\n{model}") logging.info("Loading HuggingFace model...") - model_hf = AutoModelForCausalLM.from_pretrained(args.input, trust_remote_code=True) + model_hf = AutoModelForCausalLM.from_pretrained(args.input) logging.info(f"Loaded model:\n{model_hf}") state_dict_hf = model_hf.state_dict() From 59e0f2ed79040a145e3f9fb16ff5540413c36f9a Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:32:53 +0000 Subject: [PATCH 17/69] rename falcon implementation dir --- .../megatron/{spec_falcon => falcon}/__init__.py | 0 .../megatron/{spec_falcon => falcon}/falcon_gpt_model.py | 0 .../megatron/{spec_falcon => falcon}/falcon_spec.py | 0 .../megatron/{spec_falcon => falcon}/spec_falcon_decoder_block.py | 0 .../megatron/{spec_falcon => falcon}/spec_falcon_decoder_layer.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/__init__.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/falcon_gpt_model.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/falcon_spec.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/spec_falcon_decoder_block.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/spec_falcon_decoder_layer.py (100%) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py From 03d06bcd32b187908b1ab82fa93054fc1fd30bf1 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:44:21 +0000 Subject: [PATCH 18/69] change dir name --- .../megatron/falcon/falcon_gpt_model.py | 10 +++++----- .../language_modeling/megatron/falcon/falcon_spec.py | 2 +- .../megatron/falcon/spec_falcon_decoder_block.py | 8 +++++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py index 67b93283b0a4..0d8f2ede9ffc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py @@ -15,7 +15,7 @@ from torch import Tensor # from megatron.core.transformer.transformer_block import TransformerBlock -from .spec_falcon_decoder_block import FalconTransformerBlock +from .falcon_decoder_block import FalconTransformerBlock class FalconGPTModel(MegatronModule): @@ -189,7 +189,7 @@ def forward( if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() logits, _ = self.output_layer(hidden_states, weight=output_weight) - + if labels is None: # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() @@ -197,7 +197,7 @@ def forward( # [b s] => [s b] labels = labels.transpose(0, 1).contiguous() loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - + # [s b] => [b, s] loss = loss.transpose(0, 1).contiguous() return loss @@ -247,7 +247,7 @@ def initialize_last_stage_with_word_embeddings(self): weight.data, group=parallel_state.get_embedding_group() ) - elif not getattr(GPTModel, "embedding_warning_printed", False): + elif not getattr(FalconGPTModel, "embedding_warning_printed", 
False): logging.getLogger(__name__).warning( "Distributed processes aren't initialized, so the output layer " "is not initialized with weights from the word embeddings. " @@ -255,7 +255,7 @@ def initialize_last_stage_with_word_embeddings(self): "this needs to be handled manually. If you are training " "something is definitely wrong." ) - GPTModel.embedding_warning_printed = True + FalconGPTModel.embedding_warning_printed = True def sharded_state_dict(self, prefix=''): sharded_state_dict = {} diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 70c90e520937..084a82c71c48 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -13,7 +13,7 @@ from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec -from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules # Use this spec for an implementation using modules in TE falcon_layer_spec = ModuleSpec( diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py index 8a009c39b59e..2717ff399a63 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py @@ -12,7 +12,7 @@ from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig -from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor @@ -208,7 +208,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p hidden_states = make_viewless_tensor( inp=hidden_states, requires_grad=True, keep_graph=True, ) - + if self.config.sequence_parallel: rng_context = tensor_parallel.get_cuda_rng_tracker().fork() else: @@ -250,13 +250,15 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p rotary_pos_emb=rotary_pos_emb, ) else: - for layer in self.layers: + for idx, layer in enumerate(self.layers): hidden_states = layer( hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, inference_params=inference_params, ) + logging.debug(f"Layer {idx + 1} tensor:", hidden_states) + logging.debug(f"Layer {idx + 1} tensor shape:", hidden_states.shape) # Final layer norm. 
if self.post_process and self.post_layer_norm: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 464457a13632..6b6e899a11a2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -106,8 +106,8 @@ def import_falcon_gpt_model(): """ try: #from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_spec import falcon_layer_spec + from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import falcon_layer_spec return FalconGPTModel, falcon_layer_spec except (ImportError, ModuleNotFoundError): raise ImportError("Failed to import FalconGPTModel. Please ensure the necessary dependencies are installed.") From 71b25b81d54b6ac04a8d64183817ffee5ebe1de2 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:50:19 +0000 Subject: [PATCH 19/69] modify block name --- .../{spec_falcon_decoder_block.py => falcon_decoder_block.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename nemo/collections/nlp/models/language_modeling/megatron/falcon/{spec_falcon_decoder_block.py => falcon_decoder_block.py} (100%) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py From 9bb2e32f0a4cc1146a9aa64458213f6e4c9b9895 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:52:08 +0000 Subject: [PATCH 20/69] rename decoder layer --- .../{spec_falcon_decoder_layer.py => falcon_decoder_layer.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename nemo/collections/nlp/models/language_modeling/megatron/falcon/{spec_falcon_decoder_layer.py => falcon_decoder_layer.py} (100%) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py From d1056032091ce701d08c2defd559b1a78af9a18d Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:53:24 +0000 Subject: [PATCH 21/69] clean up --- .../nlp/models/language_modeling/megatron/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index b024f0b061c3..bdd9da8799e6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,7 +15,7 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from 
nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True From 65fb7266c03afba2c5c0194263879d3d41844318 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 04:03:59 +0000 Subject: [PATCH 22/69] remove debug --- .../language_modeling/megatron/falcon/falcon_decoder_block.py | 2 -- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py index 2717ff399a63..52eb22a25f85 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py @@ -257,8 +257,6 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p rotary_pos_emb=rotary_pos_emb, inference_params=inference_params, ) - logging.debug(f"Layer {idx + 1} tensor:", hidden_states) - logging.debug(f"Layer {idx + 1} tensor shape:", hidden_states.shape) # Final layer norm. if self.post_process and self.post_layer_norm: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 6b6e899a11a2..aa68ca44df8a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -329,8 +329,6 @@ def model_provider_func(self, pre_process, post_process): if self.mcore_gpt and self.falcon_name: FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() transformer_layer_spec = falcon_layer_spec - #debug - logging.info(f'falcon gpt config, {self.transformer_config}') model = FalconGPTModel( config=self.transformer_config, transformer_layer_spec = transformer_layer_spec, From c4ad769d8994fa56d48f0320aedadc0ebb0de936 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:13:37 +0000 Subject: [PATCH 23/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron/falcon/__init__.py | 2 +- .../megatron/falcon/falcon_decoder_block.py | 18 ++--- .../megatron/falcon/falcon_decoder_layer.py | 74 +++++++++---------- .../megatron/falcon/falcon_gpt_model.py | 25 ++----- .../megatron/falcon/falcon_spec.py | 10 +-- .../language_modeling/megatron_gpt_model.py | 33 +++++---- .../convert_hf_falcon_to_nemo.py | 51 +++++++------ 7 files changed, 96 insertions(+), 117 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py index 46da18a40345..5dd085d829f6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py @@ -1 +1 @@ -from .falcon_gpt_model import FalconGPTModel \ No newline at end of file +from .falcon_gpt_model import FalconGPTModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py 
b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py index 52eb22a25f85..16bda328f38b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py @@ -4,7 +4,6 @@ from contextlib import nullcontext import torch - from megatron.core import parallel_state, tensor_parallel from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.transformer.custom_layers.transformer_engine import TENorm @@ -12,9 +11,10 @@ from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig -from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor +from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules + class FalconTransformerBlock(MegatronModule): """Transformer class.""" @@ -205,10 +205,8 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # likely redundant, since p2p_communication.py (likely originator) # already creates viewless tensors. That said, make_viewless_tensor() # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor( - inp=hidden_states, requires_grad=True, keep_graph=True, - ) - + hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) + if self.config.sequence_parallel: rng_context = tensor_parallel.get_cuda_rng_tracker().fork() else: @@ -245,9 +243,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # Forward pass. 
if self.config.recompute_granularity == 'full': hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, + hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, ) else: for idx, layer in enumerate(self.layers): @@ -283,8 +279,6 @@ def sharded_state_dict(self, prefix=''): if 'final_layernorm.bias' in state_dict.keys(): tensor = state_dict['final_layernorm.bias'] layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( - tensor, layer_name - ) + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index a7bb1a588e40..9ff495c87e7e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -31,6 +31,7 @@ hyperparameters: transformer hyperparameters """ + @dataclass class FalconTransformerLayerSubmodules: input_layernorm: Union[ModuleSpec, type] = IdentityOp @@ -38,7 +39,7 @@ class FalconTransformerLayerSubmodules: self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp post_self_attn_layernorm: Union[ModuleSpec, type] = IdentityOp - + pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp mlp: Union[ModuleSpec, type] = IdentityOp mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp @@ -54,14 +55,13 @@ class FalconTransformerLayer(MegatronModule): def __init__( self, - config: TransformerConfig, # should come from FalconTransformerConfig class + config: TransformerConfig, # should come from FalconTransformerConfig class submodules: FalconTransformerLayerSubmodules, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): super().__init__(config=config) self.config: TransformerConfig = config - self.layer_number = layer_number + self._get_layer_offset() @@ -70,13 +70,12 @@ def __init__( if hasattr(self.config, 'new_decoder_architecture'): self.new_decoder_architecture = self.config.new_decoder_architecture else: - self.new_decoder_architecture = None - + self.new_decoder_architecture = None + if hasattr(self.config, 'parallel_attention'): self.parallel_attention = self.config.parallel_attention else: - self.parallel_attention = None - + self.parallel_attention = None ## [Module 1: Input Layernorm] Optional Layernorm on the input data # TODO: add pytorch only layernorm @@ -92,9 +91,7 @@ def __init__( ) ## [Module 2: SelfAttention] - self.self_attention = build_module( - submodules.self_attention, config=self.config, layer_number=layer_number, - ) + self.self_attention = build_module(submodules.self_attention, config=self.config, layer_number=layer_number,) ## [Module 3: BiasDropoutFusion] Optional self.self_attn_bda = build_module(submodules.self_attn_bda) @@ -115,23 +112,27 @@ def __init__( ) ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture - self.pre_mlp_layernorm = build_module( - submodules.pre_mlp_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - 
normalization=self.config.normalization, - ) if self.new_decoder_architecture else None + self.pre_mlp_layernorm = ( + build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + if self.new_decoder_architecture + else None + ) ## [Module 6: MLP block] self.mlp = build_module(submodules.mlp, config=self.config) ## [Module 7: BiasDropoutFusion] Optional self.mlp_bda = build_module(submodules.mlp_bda) - + # @jcasper how should we handle nvfuser? # Set bias+dropout+add fusion grad_enable execution handler. # TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -151,7 +152,7 @@ def _get_layer_offset(self): if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - + total_num_layers = self.config.num_layers num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size total_virtual_chunks = total_num_layers // vp_size @@ -163,7 +164,7 @@ def _get_layer_offset(self): offset = pipeline_rank * num_layers_per_pipeline_rank else: offset = 0 - + return offset def forward( @@ -176,13 +177,13 @@ def forward( rotary_pos_emb=None, ): # hidden_states: [s, b, h] - + # Residual connection. residual = hidden_states - + if self.new_decoder_architecture: mlp_ln_output = self.pre_mlp_layernorm(hidden_states) - + # Optional Input Layer norm input_layernorm_output = self.input_layernorm(hidden_states) @@ -195,7 +196,7 @@ def forward( inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, ) - + # TODO: could we move `bias_dropout_add_exec_handler` itself # inside the module provided in the `bias_dropout_add_spec` module? with self.bias_dropout_add_exec_handler(): @@ -214,11 +215,11 @@ def forward( layernorm_output = mlp_ln_output mlp_output_with_bias = self.mlp(layernorm_output) - + # falcon specific: if self.new_decoder_architecture or self.parallel_attention: - mlp_output= mlp_output_with_bias[0] - attn_output= attention_output_with_bias[0] + mlp_output = mlp_output_with_bias[0] + attn_output = attention_output_with_bias[0] mlp_output_without_bias = mlp_output + attn_output mlp_output_with_bias = (mlp_output_without_bias, None) @@ -235,9 +236,7 @@ def forward( # won't result in memory savings (like the data loader, or # p2p_communication), it serves to document the origin of this # 'view' tensor. 
- output = make_viewless_tensor( - inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True - ) + output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) return output @@ -279,18 +278,13 @@ def sharded_state_dict(self, prefix=''): replica_id = parallel_state.get_data_parallel_rank() else: replica_id = ( - parallel_state.get_data_parallel_rank() - * parallel_state.get_data_parallel_world_size() + parallel_state.get_data_parallel_rank() * parallel_state.get_data_parallel_world_size() + parallel_state.get_tensor_model_parallel_rank() ) if layer_name.endswith('._extra_state'): sharded_state_dict[layer_key] = ShardedObject( - f'{prefix}{layer_name}', - tensor, - (num_layers,), - (global_layer_offset,), - replica_id, + f'{prefix}{layer_name}', tensor, (num_layers,), (global_layer_offset,), replica_id, ) else: diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py index 0d8f2ede9ffc..05d449544094 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py @@ -62,7 +62,7 @@ def __init__( seq_len_interpolation_factor: Optional[float] = None, ): super(FalconGPTModel, self).__init__(config=config) - + self.config: TransformerConfig = config self.transformer_layer_spec: ModuleSpec = transformer_layer_spec self.vocab_size = vocab_size @@ -116,8 +116,7 @@ def __init__( bias=False, skip_bias_add=False, gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process - and self.share_embeddings_and_output_weights, + skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, ) if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): @@ -189,7 +188,7 @@ def forward( if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() logits, _ = self.output_layer(hidden_states, weight=output_weight) - + if labels is None: # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() @@ -197,7 +196,7 @@ def forward( # [b s] => [s b] labels = labels.transpose(0, 1).contiguous() loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - + # [s b] => [b, s] loss = loss.transpose(0, 1).contiguous() return loss @@ -243,9 +242,7 @@ def initialize_last_stage_with_word_embeddings(self): if torch.distributed.is_initialized(): if parallel_state.is_rank_in_embedding_group(): weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce( - weight.data, group=parallel_state.get_embedding_group() - ) + torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) elif not getattr(FalconGPTModel, "embedding_warning_printed", False): logging.getLogger(__name__).warning( @@ -262,9 +259,7 @@ def sharded_state_dict(self, prefix=''): if self.pre_process: embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict( - prefix=embedding_prefix - ) + embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) sharded_state_dict.update(embedding_sharded_state_dict) decoder_prefix = f'{prefix}decoder.' 
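# [Editor's note] Illustrative sketch, not part of the patch: FalconTransformerLayer.forward()
# above distinguishes three residual-stream layouts. With attn/mlp/ln_* as stand-in callables
# (dropout, biases and the bias-dropout-add fusions omitted), the intended dataflow is roughly:
def falcon_layer_variants_sketch(x, attn, mlp, ln_in, ln_post=None, ln_mlp=None,
                                 parallel_attention=False, new_decoder_architecture=False):
    if new_decoder_architecture:
        # falcon-40b/180b: two layernorms on the same input, both branches summed into the residual
        return x + attn(ln_in(x)) + mlp(ln_mlp(x))
    if parallel_attention:
        # falcon-7b: a single layernorm feeds both branches, branches summed into the residual
        h = ln_in(x)
        return x + attn(h) + mlp(h)
    # falcon-rw: classic GPT ordering with a post-attention layernorm
    h = x + attn(ln_in(x))
    return h + mlp(ln_post(h))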
@@ -282,9 +277,7 @@ def sharded_state_dict(self, prefix=''): first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' dp_rank = parallel_state.get_data_parallel_rank() dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = ( - dp_rank + dp_size - ) # copy of first stage embedding + last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( tensor=tensor, @@ -296,9 +289,7 @@ def sharded_state_dict(self, prefix=''): sharded_state_dict[output_layer_key] = sharded_output_layer_tensor else: - output_layer_state_dict = self.output_layer.state_dict( - prefix=output_layer_prefix, keep_vars=True - ) + output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) output_layer_tensor = output_layer_state_dict[output_layer_key] # independent output layer sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 084a82c71c48..be62fcb33bea 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -3,16 +3,17 @@ from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, - TERowParallelLinear, TENorm, - TEColumnParallelLinear, + TERowParallelLinear, ) from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec + from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules # Use this spec for an implementation using modules in TE @@ -33,10 +34,7 @@ post_self_attn_layernorm=TENorm, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear, - ), + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, ), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 69ff32063d72..39dff75be55c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -16,7 +16,7 @@ import os import queue import warnings -from dataclasses import fields, dataclass +from dataclasses import dataclass, fields from functools import partial from typing import Any, Dict, Iterator, List, Optional, Union @@ -76,12 +76,12 @@ try: from megatron.core import InferenceParams, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from 
megatron.core.transformer.spec_utils import import_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal - from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec - from megatron.core.transformer.spec_utils import import_module # TODO @tmoon: Use once available in Megatron-LM # from megatron.core.pipeline_parallel.schedules import DataIteratorList @@ -103,26 +103,30 @@ except (ImportError, ModuleNotFoundError): HAVE_TE = False + def import_falcon_gpt_model(): """Conditionally import FalconGPTModel. """ try: - #from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel + # from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import falcon_layer_spec + return FalconGPTModel, falcon_layer_spec except (ImportError, ModuleNotFoundError): raise ImportError("Failed to import FalconGPTModel. Please ensure the necessary dependencies are installed.") -@dataclass + +@dataclass class FalconTransformerConfig(TransformerConfig): """ Transformer Config for Falcon Variants """ - + new_decoder_architecture: bool = False parallel_attention: bool = False + class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ Megatron GPT Wrapper for ONNX export @@ -239,7 +243,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.falcon_name = cfg.get('name', 'megatron_falcon_gpt') self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) self.parallel_attention = cfg.get('parallel_attention', False) - + self.rampup_batch_size = self.cfg.get('rampup_batch_size', None) if self.rampup_batch_size: self.prev_consumed_samples = 0 @@ -334,7 +338,7 @@ def model_provider_func(self, pre_process, post_process): transformer_layer_spec = falcon_layer_spec model = FalconGPTModel( config=self.transformer_config, - transformer_layer_spec = transformer_layer_spec, + transformer_layer_spec=transformer_layer_spec, vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -345,13 +349,12 @@ def model_provider_func(self, pre_process, post_process): rotary_percent=self.cfg.get('rotary_percentage', 1.0), seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), ) - - + elif self.mcore_gpt: transformer_layer_spec = gpt_layer_with_transformer_engine_spec model = MCoreGPTModel( config=self.transformer_config, - transformer_layer_spec = transformer_layer_spec, + transformer_layer_spec=transformer_layer_spec, vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -1562,7 +1565,7 @@ def build_transformer_config(self) -> TransformerConfig: mcore_gpt = self.cfg.get('mcore_gpt', False) new_decoder_architecture = self.cfg.get('new_decoder_architecture', False) parallel_attention = self.cfg.get('parallel_attention', False) - + normalization = self.cfg.get('normalization', 'layernorm') if normalization == 'layernorm': normalization = 'LayerNorm' @@ -1644,12 +1647,12 @@ def build_transformer_config(self) -> TransformerConfig: f"The model: {self} does not have field.name: {field.name} in its cfg. 
" f"Add this key to cfg or config_mapping to make to make it configurable." ) - + if mcore_gpt and (new_decoder_architecture or parallel_attention): transformer_config = FalconTransformerConfig( **transformer_config_dict, - new_decoder_architecture = new_decoder_architecture, - parallel_attention = parallel_attention, + new_decoder_architecture=new_decoder_architecture, + parallel_attention=parallel_attention, ) else: transformer_config = TransformerConfig(**transformer_config_dict) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index a47ec258662e..3438bbddafaf 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -32,19 +32,20 @@ import argparse import os -from typing import Dict import time +from typing import Dict import pytorch_lightning as pl import torch import yaml from omegaconf import OmegaConf -from transformers import FalconConfig, AutoModelForCausalLM +from transformers import AutoModelForCausalLM, FalconConfig from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.utils import logging + def convert_state_dict(state_dict: Dict[str, torch.Tensor], amp: bool = False): def get_new_key(old_key): if old_key == "transformer.word_embeddings.weight": @@ -53,10 +54,10 @@ def get_new_key(old_key): return old_key.replace("transformer.ln_f", "decoder.final_layernorm") elif old_key.startswith("lm_head"): return old_key.replace("lm_head", "output_layer") - + # For the rest, a base transformation key = old_key.replace("transformer.h", "decoder.layers") - + # Handling the layer normalization replacements if falcon_config.new_decoder_architecture: key = key.replace("ln_attn", "input_layernorm") @@ -65,7 +66,7 @@ def get_new_key(old_key): key = key.replace("input_layernorm", "input_layernorm") if not falcon_config.parallel_attn: key = key.replace("post_attention_layernorm", "post_self_attn_layernorm") - + key = key.replace("self_attention.dense", "self_attention.linear_proj") key = key.replace("self_attention.query_key_value", "self_attention.linear_qkv") key = key.replace("dense_h_to_4h", "linear_fc1") @@ -83,6 +84,7 @@ def get_new_key(old_key): return new_dict + def load_falcon_config(args) -> FalconConfig: """ Helper utility to load FalconConfig. 
@@ -114,39 +116,37 @@ def load_falcon_config(args) -> FalconConfig: config.model_type = 'falcon' return config + if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, required=True, help="Path to the megatron_gpt_config.yaml file") parser.add_argument( - "--config", type=str, required=True, help="Path to the megatron_gpt_config.yaml file" - ) - parser.add_argument( - "--input", type=str, required=True, help="Falcon variants from HuggingFace hub or local dir with downloaded model" - ) - parser.add_argument( - "--output", type=str, default=".", help="Path to dir where to store output .nemo file" + "--input", + type=str, + required=True, + help="Falcon variants from HuggingFace hub or local dir with downloaded model", ) + parser.add_argument("--output", type=str, default=".", help="Path to dir where to store output .nemo file") parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) - parser.add_argument( - "--cuda", action="store_true", help="Put Nemo model onto GPU prior to saving" - ) - + parser.add_argument("--cuda", action="store_true", help="Put Nemo model onto GPU prior to saving") + args = parser.parse_args() if not os.path.isdir(args.output): raise FileNotFoundError(f"Output directory '{args.output}' does not exist") - + falcon_config = load_falcon_config(args) logging.info(f"falcon_config, {falcon_config}") with open(args.config, "r", encoding="utf_8") as f: orig_cfg = yaml.safe_load(f) - + model_dict = orig_cfg["model"] - + if "data" in model_dict: del model_dict["data"] - + override_model_dict = { "micro_batch_size": 1, "global_batch_size": 1, @@ -226,26 +226,25 @@ def load_falcon_config(args) -> FalconConfig: override_model_dict["num_query_groups"] = falcon_config.num_kv_heads elif falcon_config.multi_query: override_model_dict["num_query_groups"] = 1 - + # Additional logic for bias fusion if falcon_config.bias: override_model_dict["bias_activation_fusion"] = True override_model_dict["bias_dropout_add_fusion"] = True - + # Addtional logic for rope scaling if falcon_config.rope_scaling is not None: if falcon_config.rope_scaling.type == 'linear': override_model_dict['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - - + model_dict.update(override_model_dict) model_dict["tokenizer"] = tokenizer_dict model_dict["name"] = 'megatron_falcon_gpt' omega_cfg = OmegaConf.create(model_dict) - + # output_path = "./falcon_megatron_config.yaml" # OmegaConf.save(config=omega_cfg, f=output_path) @@ -287,4 +286,4 @@ def load_falcon_config(args) -> FalconConfig: logging.info("Done.") tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logging.info(f'nemo model created and saved. Total time: {t}') \ No newline at end of file + logging.info(f'nemo model created and saved. 
Total time: {t}') From b9264d46a8ab62e11e5abe2cd8c9a336a78fff1f Mon Sep 17 00:00:00 2001 From: Vivian Date: Wed, 11 Oct 2023 01:17:02 +0000 Subject: [PATCH 24/69] add proper header Signed-off-by: Vivian --- .../megatron/falcon/__init__.py | 14 ++++++++++++++ .../megatron/falcon/falcon_decoder_block.py | 14 +++++++++++++- .../megatron/falcon/falcon_decoder_layer.py | 14 +++++++++++++- .../megatron/falcon/falcon_gpt_model.py | 16 ++++++++++++++-- .../megatron/falcon/falcon_spec.py | 14 ++++++++++++++ 5 files changed, 68 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py index 5dd085d829f6..d6a3184288ce 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .falcon_gpt_model import FalconGPTModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py index 16bda328f38b..b2ee4882ed46 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py @@ -1,4 +1,16 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import re from contextlib import nullcontext diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 9ff495c87e7e..23c74cfaa083 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -1,4 +1,16 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from dataclasses import dataclass from typing import Union diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py index 05d449544094..33369c3c3d97 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py @@ -1,5 +1,17 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# just copy paste here, need work +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging from typing import Literal, Optional diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index be62fcb33bea..4906442c5426 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
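# [Editor's note] Illustrative sketch, not part of the patch: the mcore_mixins.py change a
# little further below in this series handles linear_qkv being either a fused LayerNorm+Linear
# (which returns a (qkv, layernorm_output) tuple) or a plain linear (which returns only qkv).
# The helper below is hypothetical and simply restates that branching: the LoRA adapter
# consumes the layernorm output when it is available and falls back to the raw hidden_states
# otherwise.
def example_split_linear_qkv_output(linear_qkv_output, hidden_states):
    if isinstance(linear_qkv_output, tuple):
        mixed_qkv, layernorm_output = linear_qkv_output
    else:
        mixed_qkv, layernorm_output = linear_qkv_output, None
    lora_input = layernorm_output if layernorm_output is not None else hidden_states
    return mixed_qkv, lora_input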
+ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear From 3dcbd384512a8c3fdafa19d686a473f84bb3ce47 Mon Sep 17 00:00:00 2001 From: Huiying Li Date: Thu, 12 Oct 2023 01:05:51 -0700 Subject: [PATCH 25/69] falcon lora mixin to support when non-fused LN linear --- .../modules/common/megatron/adapters/mcore_mixins.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index a56318294e38..990bbcf5e94d 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -57,13 +57,21 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): Derives `query`, `key` and `value` tensors from `hidden_states`. """ # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] - (mixed_qkv, layernorm_output), _ = self.linear_qkv(hidden_states) + linear_qkv_output, _ = self.linear_qkv(hidden_states) + layernorm_output = None + if isinstance(linear_qkv_output, tuple): #if LN and linear fused, both will be returned + mixed_qkv, layernorm_output = linear_qkv_output + else: # otherwise only mixed_qkv + mixed_qkv = linear_qkv_output # LoRA logic if self.is_adapter_available(): lora_kqv_adapter = self.get_adapter_module(AdapterName.LORA_KQV_ADAPTER) if lora_kqv_adapter: - lora_mixed_qkv = lora_kqv_adapter(layernorm_output) + if layernorm_output: + lora_mixed_qkv = lora_kqv_adapter(layernorm_output) + else: + lora_mixed_qkv = lora_kqv_adapter(hidden_states) mixed_qkv = mixed_qkv + lora_mixed_qkv # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] From a9df5a4ded30ef458d180add1937c9dc41db12d2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Oct 2023 02:47:53 +0000 Subject: [PATCH 26/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/modules/common/megatron/adapters/mcore_mixins.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 990bbcf5e94d..d23146375802 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -59,9 +59,9 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] linear_qkv_output, _ = self.linear_qkv(hidden_states) layernorm_output = None - if isinstance(linear_qkv_output, tuple): #if LN and linear fused, both will be returned - mixed_qkv, layernorm_output = linear_qkv_output - else: # otherwise only mixed_qkv + if isinstance(linear_qkv_output, tuple): # if LN and linear fused, both will be returned + mixed_qkv, layernorm_output = linear_qkv_output + else: # otherwise only mixed_qkv mixed_qkv = linear_qkv_output # LoRA logic From d7129255503a226fa158ab8a1f17caac0fc2fe80 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 17 Oct 2023 04:42:20 +0000 Subject: [PATCH 27/69] revise jenkinsfile, tokenizer update in convertion script, add two falcon config files 
Signed-off-by: Vivian --- Jenkinsfile | 13 +- .../conf/megatron_falcon_config.yaml | 217 ++++++++++++++++++ .../conf/megatron_falcon_inference.yaml | 39 ++++ .../convert_hf_falcon_to_nemo.py | 30 +-- 4 files changed, 283 insertions(+), 16 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_falcon_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml diff --git a/Jenkinsfile b/Jenkinsfile index 3d262931915b..50b2ebd225d5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -59,11 +59,11 @@ pipeline { stage('Megatron Core installation') { steps { - // pinned MCore https://github.com/NVIDIA/Megatron-LM/commit/ab0336a5c8eab77aa74ae604ba1e73decbf6d560 + // pinned MCore https://github.com/NVIDIA/Megatron-LM/commit/954a65b04c01a4986adbad2a7cc9e9a2d094dd77 // ToT for 23.08 branch sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \ + git checkout 954a65b04c01a4986adbad2a7cc9e9a2d094dd77 && \ pip install -e .' } } @@ -135,6 +135,15 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo' } } + stage('Falcon') { + steps { + sh 'python scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py \ + --config examples/nlp/language_modeling/conf/megatron_gpt_config.yaml \ + --input /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ + --output /home/TestData/nlp/megatron_gpt/falcon-ci-hf' + sh 'rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_falcon-ci-hf_bf16_tp1_pp1.nemo' + } + } } } diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml new file mode 100644 index 000000000000..b9a38aa5b952 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -0,0 +1,217 @@ +name: megatron_falcon_gpt +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_falcon_gpt + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + mcore_gpt: True + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 1 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 32 # 7b: 32 | 40b: 60 | 180b: 80 + hidden_size: 4544 # 7b: 4544 | 40b: 8192 | 180b: 14848 + ffn_hidden_size: 18176 # Transformer FFN hidden size. Usually 4 * hidden_size. | 7b: 18176 | 40b: 32768 | 180b: 59392 + num_attention_heads: 71 # 7b: 71 | 40b: 128 | 180b: 232 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. 
+  transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
+  openai_gelu: False # Use OpenAI's GELU instead of the default GeLU
+  normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
+  position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope']
+  rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this.
+  attention_type: 'multihead' # Attention type. Options ['multihead']
+  share_embeddings_and_output_weights: False # Share embedding and output layer weights.
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: 1 # Number of query groups for group query attention. If None, normal attention is used. | 7b: 1 | 40b: 8 | 180b: 8
+  new_decoder_architecture: false
+  parallel_attention: true
+
+  tokenizer:
+    library: 'huggingface'
+    type: 'tiiuae/falcon-7b'
+    use_fast: True
+
+  # Mixed precision
+  native_amp_init_scale: 4294967296 # 2 ** 32
+  native_amp_growth_interval: 1000
+  hysteresis: 2 # Gradient scale hysteresis
+  fp32_residual_connection: False # Move residual connections to fp32
+  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
+
+  # Megatron O2-style half-precision
+  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  grad_allreduce_chunk_size_mb: 125
+
+  # Fusion
+  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.
+  gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2.
+  bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
+  bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
+  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
+  get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+
+
+  # Miscellaneous
+  seed: 1234
+  resume_from_checkpoint: null # manually set the checkpoint file to load from
+  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
+  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
+  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
+  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+  sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages
+
+  ## Activation Checkpointing
+  # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
+ # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. 
Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + data: + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + # data_prefix: ??? + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml new file mode 100644 index 000000000000..298b6a702571 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml @@ -0,0 +1,39 @@ +inference: + greedy: False # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: False # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+  compute_logprob: False # whether to compute the log probability of all the input text; a special case of running inference, default False
+  end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated
+
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: False # logger provided by exp_manager
+  precision: bf16 # 16, 32, or bf16
+  use_distributed_sampler: False
+
+tensor_model_parallel_size: -1
+pipeline_model_parallel_size: -1
+pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
+megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory
+gpt_model_file: null # GPT nemo file path
+checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training
+checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
+hparams_file: null # model configuration file, only used for PTL checkpoint loading
+prompts: # prompts for GPT inference
+  - "Q: How are you?"
+  - "Q: How big is the universe?"
+server: False # whether to launch the API server
+port: 5555 # the port number for the inference server
+web_server: False # whether to launch the web inference server
+share: False # whether to create a public URL
+username: test # user name for web client
+password: test2 # password for web client
+web_port: 9889 # the port number of the web server
diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py
index 3438bbddafaf..20098ed5d554 100644
--- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py
+++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py
@@ -15,10 +15,8 @@
 """
 Conversion script to convert Huggingface Falcon 1B/7B/40B/180B checkpoints into nemo checkpoint.
 
-This script will generate a Megatron model with TP=1 and PP=1. If you need different TP/PP
-values, then after running this script, please use the script located below to set the
-TP/PP values you want:
-    NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py
+This script will generate a Megatron model with TP=1 and PP=1. The new distributed checkpoint format does not
+require the user to run an additional script to change the TP/PP values manually.
 
 Example to run this conversion script:
 ```
@@ -88,7 +86,7 @@ def get_new_key(old_key):
 def load_falcon_config(args) -> FalconConfig:
     """ Helper utility to load FalconConfig.
 
-    Falcon-7B and Falcon-40B are not compatible with `transformers.FalconConfig` and
+    Legacy Falcon-7B and Falcon-40B are not compatible with `transformers.FalconConfig` and
     `transformers.FalconModel`. need to manually set the config values
     and force to `falcon` model type.
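In practice, the manual handling that this docstring refers to amounts to renaming the legacy `RefinedWeb`/`RefinedWebModel` config fields into their `transformers.FalconConfig` equivalents. A minimal sketch of that mapping follows; it is an editorial illustration, not the script's actual implementation, and the legacy field names (`n_layer`, `n_head`, `n_head_kv`, etc.) are assumptions based on the published tiiuae config.json files:

```python
# Hedged illustration only -- not the conversion script's code. Legacy Falcon checkpoints
# (model_type "RefinedWeb" / "RefinedWebModel") expose n_layer/n_head style fields that must
# be renamed before transformers.FalconConfig will accept them.
from transformers import AutoConfig, FalconConfig


def build_falcon_config_from_legacy(checkpoint_dir: str) -> FalconConfig:
    legacy = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True)
    if legacy.model_type == "falcon":
        # Already in the native format supported by transformers.
        return FalconConfig.from_pretrained(checkpoint_dir)
    # Field names below come from the legacy config.json files and are assumptions here.
    return FalconConfig(
        vocab_size=legacy.vocab_size,
        hidden_size=legacy.hidden_size,
        num_hidden_layers=legacy.n_layer,
        num_attention_heads=legacy.n_head,
        num_kv_heads=getattr(legacy, "n_head_kv", None),
        new_decoder_architecture=legacy.model_type == "RefinedWeb",  # assumed 40B-style blocks
        multi_query=getattr(legacy, "multi_query", False),
        parallel_attn=legacy.parallel_attn,
        alibi=legacy.alibi,
        bias=legacy.bias,
        layer_norm_epsilon=legacy.layer_norm_epsilon,
        initializer_range=legacy.initializer_range,
    )
```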
""" @@ -138,7 +136,6 @@ def load_falcon_config(args) -> FalconConfig: raise FileNotFoundError(f"Output directory '{args.output}' does not exist") falcon_config = load_falcon_config(args) - logging.info(f"falcon_config, {falcon_config}") with open(args.config, "r", encoding="utf_8") as f: orig_cfg = yaml.safe_load(f) @@ -245,19 +242,14 @@ def load_falcon_config(args) -> FalconConfig: omega_cfg = OmegaConf.create(model_dict) - # output_path = "./falcon_megatron_config.yaml" - # OmegaConf.save(config=omega_cfg, f=output_path) - trainer = pl.Trainer(**trainer_dict) logging.info("Creating Megatron model...") tik = time.time() model = MegatronGPTModel(omega_cfg, trainer) - logging.info(f"Created model:\n{model}") logging.info("Loading HuggingFace model...") model_hf = AutoModelForCausalLM.from_pretrained(args.input) - logging.info(f"Loaded model:\n{model_hf}") state_dict_hf = model_hf.state_dict() convert_dict = convert_state_dict(state_dict_hf, amp=omega_cfg.megatron_amp_O2) @@ -277,12 +269,22 @@ def load_falcon_config(args) -> FalconConfig: raise RuntimeError(f"Unexpected keys: \n{unexpected_keys}") logging.info("Saving model...") - + + # We make sure that the tokenizer can be instantiated later regardless of args.input + if falcon_config.new_decoder_architecture: + model.cfg.tokenizer.update(type="tiiuae/falcon-40b") + elif falcon_config.multi_query: + model.cfg.tokenizer.update(type="tiiuae/falcon-7b") + elif falcon_config.alibi and falcon_config.num_hidden_layers == 36: + model.cfg.tokenizer.update(type="tiiuae/falcon-rw-7b") + else: + model.cfg.tokenizer.update(type="tiiuae/falcon-rw-1b") + dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = model.to(dtype=dtype) model.cfg.update(use_cpu_initialization=False) - name_last_part = os.path.basename(args.input.rstrip('/')) - model.save_to(os.path.join(args.output, f'falcon_{name_last_part}_{args.precision}_tp1_pp1.nemo')) + tokenizer_name_part = model.cfg.tokenizer["type"].split("/")[1] + model.save_to(os.path.join(args.output, f'falcon_{tokenizer_name_part}_{args.precision}_tp1_pp1.nemo')) logging.info("Done.") tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) From 7a46e868c6b1b59617142b6af90ca1a417c9ca6e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Oct 2023 04:44:35 +0000 Subject: [PATCH 28/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 20098ed5d554..275955d5a056 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -269,7 +269,7 @@ def load_falcon_config(args) -> FalconConfig: raise RuntimeError(f"Unexpected keys: \n{unexpected_keys}") logging.info("Saving model...") - + # We make sure that the tokenizer can be instantiated later regardless of args.input if falcon_config.new_decoder_architecture: model.cfg.tokenizer.update(type="tiiuae/falcon-40b") @@ -279,7 +279,7 @@ def load_falcon_config(args) -> FalconConfig: model.cfg.tokenizer.update(type="tiiuae/falcon-rw-7b") else: model.cfg.tokenizer.update(type="tiiuae/falcon-rw-1b") - + dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = 
model.to(dtype=dtype) model.cfg.update(use_cpu_initialization=False) From d4a5fec8ebc435160b52f2bd9c60520e07f94c15 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 2 Nov 2023 22:49:30 -0700 Subject: [PATCH 29/69] refactor falcon to use MCoreGPT+spec+baselayer initial commit --- .../language_modeling/megatron/__init__.py | 2 +- .../megatron/falcon/__init__.py | 2 - .../megatron/falcon/falcon_decoder_block.py | 296 ---------------- .../megatron/falcon/falcon_decoder_layer.py | 42 +-- .../megatron/falcon/falcon_gpt_model.py | 316 ------------------ .../megatron/falcon/falcon_spec.py | 41 +-- .../language_modeling/megatron_gpt_model.py | 82 ++--- .../convert_hf_falcon_to_nemo.py | 9 +- 8 files changed, 61 insertions(+), 729 deletions(-) delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index bdd9da8799e6..70b6d4c169b8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,8 +15,8 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon import falcon_spec HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py index d6a3184288ce..4fc50543f1d2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .falcon_gpt_model import FalconGPTModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py deleted file mode 100644 index b2ee4882ed46..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import re -from contextlib import nullcontext - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.fusions.fused_layer_norm import FusedLayerNorm -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor - -from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules - - -class FalconTransformerBlock(MegatronModule): - """Transformer class.""" - - def __init__( - self, - config: TransformerConfig, - transformer_layer_spec: ModuleSpec, - self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, - pre_process=True, - post_process=True, - ): - super().__init__(config=config) - - self.config: TransformerConfig = config - self.transformer_layer_spec: ModuleSpec = transformer_layer_spec - - self.self_attn_mask_type = self_attn_mask_type - self.post_layer_norm = post_layer_norm - self.pre_process = pre_process - self.post_process = post_process - - # required for pipeline parallel schedules - self.input_tensor = None - - self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' - - self.num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - self._build_layers(self.transformer_layer_spec) - - def _build_layers(self, transformer_layer_spec): - # Transformer layers. - # @jcasper can we improve how we deal with layer_number? - # currently it's only used in CoreAttention? - # if self.apply_query_key_layer_scaling: - # coeff = self.layer_number - # self.norm_factor *= coeff - def build_layer(layer_number): - layer = FalconTransformerLayer( - config=self.config, - submodules=transformer_layer_spec.submodules, - layer_number=layer_number, - self_attn_mask_type=self.self_attn_mask_type, - ) - return layer - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - # Interleaved pipeline parallelism: - # Number of layers in each model chunk is the number of layers in the stage, - # divided by the number of model chunks in a stage. - # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0] [2] [4] [6] - # Stage 1: [1] [3] [5] [7] - # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0, 1] [4, 5] - # Stage 1: [2, 3] [6, 7] - - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size - - num_layers_to_build = num_layers_per_virtual_rank - - else: - # Non-interleaved pipeline parallelism: - # Each stage gets a contiguous set of layers. 
- - num_layers_to_build = self.num_layers_per_pipeline_rank - - # offset is implicit in TransformerLayer - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) - - # # TODO: add back standalone_embedding_stage - # if self.num_layers == 0: - # # When a standalone embedding stage is used (e.g., - # # args.standalone_embedding_stage == True), virtual pipeline ranks - # # on pipeline rank 0 will have zero transformer layers assigned to - # # them. This results in the model's input and output tensors to be - # # the same, which will cause failure for certain output tensor - # # optimizations (e.g., pipeline output deallocation). To remedy - # # this, we assign a 'no-op' layer on these ranks, which will - # # disconnect the input tensor from the output tensor. - # self.num_layers = 1 - # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) - # else: - # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) - - if self.post_process and self.post_layer_norm: - # Final layer norm before output. - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, - ) - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): - """Forward method with activation checkpointing.""" - - def custom(start, end): - def custom_forward(*args, **kwargs): - x_, *args = args - for index in range(start, end): - layer = self._get_layer(index) - x_ = layer(x_, *args, **kwargs) - return x_ - - return custom_forward - - if self.config.recompute_method == 'uniform': - # Uniformly divide the total number of Transformer layers and checkpoint - # the input activation of each divided chunk. - # A method to further reduce memory usage reducing checkpoints. - l = 0 - while l < self.num_layers_per_pipeline_rank: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + self.config.recompute_num_layers), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - - l += self.config.recompute_num_layers - - elif self.config.recompute_method == 'block': - # Checkpoint the input activation of only a set number of individual - # Transformer layers and skip the rest. - # A method fully use the device memory removing redundant re-computation. - for l in range(self.num_layers_per_pipeline_rank): - if l < self.config.recompute_num_layers: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + 1), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - else: - hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) - else: - raise ValueError("Invalid activation recompute method.") - - return hidden_states - - def set_input_tensor(self, input_tensor): - """Set input tensor to be used instead of forward()'s input. - - When doing pipeline parallelism the input from the previous - stage comes from communication, not from the input, so the - model's forward_step_func won't have it. 
This function is thus - used by internal code to bypass the input provided by the - forward_step_func""" - self.input_tensor = input_tensor - - def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): - # hidden_states (float): [s, b, h] - # attention_mask (bool): [1, 1, s, s] - - if not self.pre_process: - # See set_input_tensor() - hidden_states = self.input_tensor - - # Viewless tensor. - # - We only need to create a viewless tensor in the case of micro batch - # size (mbs) == 1, since in this case, 'hidden_states.transpose()' - # above creates a view tensor, and '.contiguous()' is a pass-through. - # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating - # the need to make it viewless. - # - # However, we don't explicitly check mbs == 1 here because - # make_viewless_tensor() has negligible overhead when its input - # is already viewless. - # - # - For the 'else' case above, calling make_viewless_tensor() here is - # likely redundant, since p2p_communication.py (likely originator) - # already creates viewless tensors. That said, make_viewless_tensor() - # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) - - if self.config.sequence_parallel: - rng_context = tensor_parallel.get_cuda_rng_tracker().fork() - else: - rng_context = nullcontext() - - if self.config.fp8: - import transformer_engine # To keep out TE dependency when not training in fp8 - - if self.config.fp8 == "e4m3": - fp8_format = transformer_engine.common.recipe.Format.E4M3 - elif self.config.fp8 == "hybrid": - fp8_format = transformer_engine.common.recipe.Format.HYBRID - else: - raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") - - fp8_recipe = transformer_engine.common.recipe.DelayedScaling( - margin=self.config.fp8_margin, - interval=self.config.fp8_interval, - fp8_format=fp8_format, - amax_compute_algo=self.config.fp8_amax_compute_algo, - amax_history_len=self.config.fp8_amax_history_len, - override_linear_precision=(False, False, not self.config.fp8_wgrad), - ) - fp8_group = None - if parallel_state.model_parallel_is_initialized(): - fp8_group = parallel_state.get_amax_reduction_group() - fp8_context = transformer_engine.pytorch.fp8_autocast( - enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group - ) - else: - fp8_context = nullcontext() - - with rng_context and fp8_context: - # Forward pass. - if self.config.recompute_granularity == 'full': - hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, - ) - else: - for idx, layer in enumerate(self.layers): - hidden_states = layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - inference_params=inference_params, - ) - - # Final layer norm. - if self.post_process and self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states - - def sharded_state_dict(self, prefix=''): - - sharded_state_dict = {} - - layer_prefix = f'{prefix}layers.' 
- for layer in self.layers: - sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) - - if self.post_process and self.post_layer_norm: - state_dict = self.state_dict(keep_vars=True) - - tensor = state_dict['final_layernorm.weight'] - layer_name = f'{prefix}final_layernorm.weight' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - # RMSNorm doesn't have bias. - if 'final_layernorm.bias' in state_dict.keys(): - tensor = state_dict['final_layernorm.bias'] - layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 23c74cfaa083..037305dd6824 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -23,7 +23,7 @@ from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp -from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.base_layer import LayerSubmodules, BaseLayer from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor @@ -45,7 +45,7 @@ @dataclass -class FalconTransformerLayerSubmodules: +class FalconTransformerLayerSubmodules(LayerSubmodules): input_layernorm: Union[ModuleSpec, type] = IdentityOp self_attention: Union[ModuleSpec, type] = IdentityOp self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp @@ -57,7 +57,7 @@ class FalconTransformerLayerSubmodules: mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp -class FalconTransformerLayer(MegatronModule): +class FalconTransformerLayer(BaseLayer): """A single transformer layer. Transformer layer takes input with size [s, b, h] and returns an @@ -72,7 +72,7 @@ def __init__( layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): - super().__init__(config=config) + super().__init__(config=config, submodules=submodules) self.config: TransformerConfig = config self.layer_number = layer_number + self._get_layer_offset() @@ -153,40 +153,14 @@ def __init__( # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad self.bias_dropout_add_exec_handler = torch.enable_grad - def _get_layer_offset(self): - - pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() - - num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - total_num_layers = self.config.num_layers - num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size - total_virtual_chunks = total_num_layers // vp_size - offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) - - else: - # Each stage gets a contiguous set of layers. 
- if parallel_state.get_pipeline_model_parallel_world_size() > 1: - offset = pipeline_rank * num_layers_per_pipeline_rank - else: - offset = 0 - - return offset - def forward( self, hidden_states, attention_mask, - encoder_output=None, - enc_dec_attn_mask=None, - inference_params=None, + context=None, + context_mask=None, rotary_pos_emb=None, + inference_params=None, ): # hidden_states: [s, b, h] @@ -250,7 +224,7 @@ def forward( # 'view' tensor. output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) - return output + return output, context def sharded_state_dict(self, prefix=''): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py deleted file mode 100644 index 33369c3c3d97..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Literal, Optional - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding -from megatron.core.models.gpt.gpt_embedding import GPTEmbedding -from megatron.core.transformer.enums import AttnMaskType, ModelType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint -from torch import Tensor - -# from megatron.core.transformer.transformer_block import TransformerBlock -from .falcon_decoder_block import FalconTransformerBlock - - -class FalconGPTModel(MegatronModule): - """Transformer language model. - - Arguments: - config (TransformerConfig): transformer config - - vocab_size (int): vocabulary size - - max_sequence_length (int): maximum size of sequence. This is used for positional embedding - - pre_process (bool): Include embedding layer (used with pipeline parallelism) - post_process (bool): Include an output layer (used with pipeline parallelism) - - parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks - - share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are - shared. Defaults to False. - - position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. - Defaults is 'learned_absolute'. - - rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. - Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. - - seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. - The value must be a float larger than 1.0. Defaults to None. 
- """ - - def __init__( - self, - config: TransformerConfig, - transformer_layer_spec: ModuleSpec, - vocab_size: int, - max_sequence_length: int, - pre_process: bool = True, - post_process: bool = True, - fp16_lm_cross_entropy: bool = False, - parallel_output: bool = True, - share_embeddings_and_output_weights: bool = False, - position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', - rotary_percent: float = 1.0, - seq_len_interpolation_factor: Optional[float] = None, - ): - super(FalconGPTModel, self).__init__(config=config) - - self.config: TransformerConfig = config - self.transformer_layer_spec: ModuleSpec = transformer_layer_spec - self.vocab_size = vocab_size - self.max_sequence_length = max_sequence_length - self.pre_process = pre_process - self.post_process = post_process - self.fp16_lm_cross_entropy = fp16_lm_cross_entropy - self.parallel_output = parallel_output - self.share_embeddings_and_output_weights = share_embeddings_and_output_weights - self.position_embedding_type = position_embedding_type - - # megatron core pipelining currently depends on model type - # TODO: remove this dependency ? - self.model_type = ModelType.encoder_or_decoder - - # Embeddings. - if self.pre_process: - self.embedding = GPTEmbedding( - config=self.config, - vocab_size=self.vocab_size, - max_sequence_length=self.max_sequence_length, - add_position_embedding=(self.position_embedding_type == 'learned_absolute'), - ) - - # Rotary Position Embeddings - if self.position_embedding_type == 'rope': - rotary_dim = self.config.kv_channels - if rotary_percent < 1.0: - rotary_dim = int(rotary_dim * rotary_percent) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) - else: - self.rotary_pos_emb = None - - # Transformer. - self.decoder = FalconTransformerBlock( - config=self.config, - transformer_layer_spec=self.transformer_layer_spec, - self_attn_mask_type=AttnMaskType.causal, - pre_process=self.pre_process, - post_process=self.post_process, - ) - - # Output - if post_process: - self.output_layer = tensor_parallel.ColumnParallelLinear( - config.hidden_size, - self.vocab_size, - config=config, - init_method=config.init_method, - bias=False, - skip_bias_add=False, - gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, - ) - - if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): - self.initialize_last_stage_with_word_embeddings() - - def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" - - # This is usually handled in schedules.py but some inference code still - # gives us non-lists or None - if not isinstance(input_tensor, list): - input_tensor = [input_tensor] - - assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' - self.decoder.set_input_tensor(input_tensor[0]) - - def forward( - self, - input_ids: Tensor, - position_ids: Tensor, - attention_mask: Tensor, - decoder_input: Tensor = None, - labels: Tensor = None, - inference_params=None, - ): - # If decoder_input is provided (not None), then input_ids and position_ids are ignored. - # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. - - # Decoder embedding. 
- if decoder_input is not None: - pass - elif self.pre_process: - decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - else: - # intermediate stage of pipeline - # decoder will get hidden_states from encoder.input_tensor - decoder_input = None - - # Rotary positional embeddings - rotary_pos_emb = None - if self.rotary_pos_emb is not None: - if inference_params is not None: - rotary_seq_len = inference_params.max_sequence_length - else: - if self.decoder.input_tensor is not None: - rotary_seq_len = self.decoder.input_tensor.size(0) - else: - rotary_seq_len = decoder_input.size(0) - - # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region - if self.config.sequence_parallel: - rotary_seq_len *= self.config.tensor_model_parallel_size - - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) - - # Run decoder. - hidden_states = self.decoder( - hidden_states=decoder_input, - attention_mask=attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb, - ) - - if not self.post_process: - return hidden_states - - # logits and loss - output_weight = None - if self.share_embeddings_and_output_weights: - output_weight = self.shared_embedding_or_output_weight() - logits, _ = self.output_layer(hidden_states, weight=output_weight) - - if labels is None: - # [s b h] => [b s h] - return logits.transpose(0, 1).contiguous() - - # [b s] => [s b] - labels = labels.transpose(0, 1).contiguous() - loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - - # [s b] => [b, s] - loss = loss.transpose(0, 1).contiguous() - return loss - - def shared_embedding_or_output_weight(self): - if self.pre_process: - return self.embedding.word_embeddings.weight - elif self.post_process: - return self.output_layer.weight - return None - - def initialize_last_stage_with_word_embeddings(self): - - # This function just initializes the word embeddings in the final stage - # when we are using pipeline parallelism and sharing word - # embeddings. Nothing to do if we aren't sharing weights or aren't using - # pipeline parallelism. - if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): - return - - if self.post_process and not self.pre_process: - assert not parallel_state.is_pipeline_first_stage() - # set word_embeddings weights to 0 here, then copy first - # stage's weights using all_reduce below. - self.output_layer.weight.data.fill_(0) - self.output_layer.weight.shared = True - - # Parameters are shared between the word embeddings layers, and the - # heads at the end of the model. In a pipelined setup with more than - # one stage, the initial embedding layer and the head are on different - # workers, so we do the following: - # 1. Create a second copy of word_embeddings on the last stage, with - # initial parameters of 0.0. - # 2. Do an all-reduce between the first and last stage to ensure that - # the two copies of word_embeddings start off with the same - # parameter values. - # 3. In the training loop, before an all-reduce between the grads of - # the two word_embeddings layers to ensure that every applied weight - # update is the same on both stages. - - # Ensure that first and last stages have the same initial parameter - # values. 
- if torch.distributed.is_initialized(): - if parallel_state.is_rank_in_embedding_group(): - weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) - - elif not getattr(FalconGPTModel, "embedding_warning_printed", False): - logging.getLogger(__name__).warning( - "Distributed processes aren't initialized, so the output layer " - "is not initialized with weights from the word embeddings. " - "If you are just manipulating a model this is fine, but " - "this needs to be handled manually. If you are training " - "something is definitely wrong." - ) - FalconGPTModel.embedding_warning_printed = True - - def sharded_state_dict(self, prefix=''): - sharded_state_dict = {} - - if self.pre_process: - embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) - sharded_state_dict.update(embedding_sharded_state_dict) - - decoder_prefix = f'{prefix}decoder.' - decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) - sharded_state_dict.update(decoder_sharded_state_dict) - - if self.post_process: - output_layer_prefix = f'{prefix}output_layer.' - output_layer_key = f'{output_layer_prefix}weight' - if self.share_embeddings_and_output_weights: - if not self.pre_process: - # when sharing embeddings with last stage, we need to use the weights from the first stage - # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight - tensor = self.shared_embedding_or_output_weight() - first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' - dp_rank = parallel_state.get_data_parallel_rank() - dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding - - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=tensor, - key=first_stage_word_emb_key, - replica_id=last_stage_word_emb_replica_id, - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - else: - output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) - output_layer_tensor = output_layer_state_dict[output_layer_key] - # independent output layer - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=output_layer_tensor, - key=output_layer_key, - replica_id=parallel_state.get_data_parallel_rank(), - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 4906442c5426..debcc2555d46 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -31,25 +31,26 @@ from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules # Use this spec for an implementation using modules in TE -falcon_layer_spec = ModuleSpec( - module=FalconTransformerLayer, - submodules=FalconTransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - 
dot_product_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, +def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=FalconTransformerLayer, + submodules=FalconTransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), ), + self_attn_bda=get_bias_dropout_add, + post_self_attn_layernorm=TENorm, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - post_self_attn_layernorm=TENorm, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ), -) + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d3297b2efd8a..2b3a41fe6a64 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -76,7 +76,6 @@ try: from megatron.core import InferenceParams, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel - from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.spec_utils import import_module @@ -104,28 +103,22 @@ HAVE_TE = False -def import_falcon_gpt_model(): - """Conditionally import FalconGPTModel. - """ - try: - # from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import falcon_layer_spec - - return FalconGPTModel, falcon_layer_spec - except (ImportError, ModuleNotFoundError): - raise ImportError("Failed to import FalconGPTModel. 
Please ensure the necessary dependencies are installed.") - - -@dataclass -class FalconTransformerConfig(TransformerConfig): - """ - Transformer Config for Falcon Variants - """ - - new_decoder_architecture: bool = False - parallel_attention: bool = False +def get_specs(spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec'): #Assumes the default spec function name + import importlib.util + name_spec_dict = { + "": "megatron.core.models.gpt.gpt_layer_specs", #default GPT + "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec" #Other customized model spec locations + } + module_path = name_spec_dict.get(spec_name) + if not module_path: + raise ImportError(f"Failed to import {spec_name}, please ensure {spec_name} is supported.") + module = importlib.import_module(module_path) + try: + spec = getattr(module, spec_func)() + except AttributeError: + raise ImportError(f"Module {module_path} does not have {spec_func}") + return spec class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ @@ -239,10 +232,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) self.mcore_gpt = cfg.get('mcore_gpt', False) - # Falcon specific args - self.falcon_name = cfg.get('name', 'megatron_falcon_gpt') - self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) - self.parallel_attention = cfg.get('parallel_attention', False) + self.spec_name = cfg.get('name', '') self.rampup_batch_size = self.cfg.get('rampup_batch_size', None) if self.rampup_batch_size: @@ -333,28 +323,10 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" - if self.mcore_gpt and self.falcon_name: - FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() - transformer_layer_spec = falcon_layer_spec - model = FalconGPTModel( - config=self.transformer_config, - transformer_layer_spec=transformer_layer_spec, - vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), - max_sequence_length=self.cfg.get('encoder_seq_length', 512), - pre_process=pre_process, - post_process=post_process, - parallel_output=True, - share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), - position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), - rotary_percent=self.cfg.get('rotary_percentage', 1.0), - seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), - ) - - elif self.mcore_gpt: - transformer_layer_spec = gpt_layer_with_transformer_engine_spec + if self.mcore_gpt: model = MCoreGPTModel( config=self.transformer_config, - transformer_layer_spec=transformer_layer_spec, + transformer_layer_spec=get_specs(self.spec_name), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -1563,10 +1535,6 @@ def build_transformer_config(self) -> TransformerConfig: gated_linear_unit = activation.endswith('glu') activation_func = activation_to_func(activation) - mcore_gpt = self.cfg.get('mcore_gpt', False) - new_decoder_architecture = self.cfg.get('new_decoder_architecture', False) - parallel_attention = self.cfg.get('parallel_attention', False) - normalization = self.cfg.get('normalization', 'layernorm') layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' if normalization == 'layernorm': @@ -1653,14 
+1621,12 @@ def build_transformer_config(self) -> TransformerConfig: f"Add this key to cfg or config_mapping to make to make it configurable." ) - if mcore_gpt and (new_decoder_architecture or parallel_attention): - transformer_config = FalconTransformerConfig( - **transformer_config_dict, - new_decoder_architecture=new_decoder_architecture, - parallel_attention=parallel_attention, - ) - else: - transformer_config = TransformerConfig(**transformer_config_dict) + transformer_config = TransformerConfig(**transformer_config_dict) + + #pass mcore customization configs directly to mcore + mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) + for key,value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) return transformer_config diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 275955d5a056..fdce82d03023 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -175,13 +175,17 @@ def load_falcon_config(args) -> FalconConfig: "position_embedding_type": "rope", "precision": args.precision, "init_method_std": falcon_config.initializer_range, - "new_decoder_architecture": falcon_config.new_decoder_architecture, - "parallel_attention": falcon_config.parallel_attn, "activation": "gelu", "bias_activation_fusion": False, "bias_dropout_add_fusion": False, "seq_len_interpolation_factor": None, } + + mcore_customization_config_dict={ + "new_decoder_architecture": falcon_config.new_decoder_architecture, + "parallel_attention": falcon_config.parallel_attn, + } + tokenizer_dict = { "library": "huggingface", "type": args.input, @@ -239,6 +243,7 @@ def load_falcon_config(args) -> FalconConfig: model_dict.update(override_model_dict) model_dict["tokenizer"] = tokenizer_dict model_dict["name"] = 'megatron_falcon_gpt' + model_dict["mcore_customization_config"] = mcore_customization_config_dict omega_cfg = OmegaConf.create(model_dict) From f1860334c3f8e45967f7c41a1fc796134f5aedb3 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 2 Nov 2023 22:50:32 -0700 Subject: [PATCH 30/69] modification to get nemo run with mcore in this version --- nemo/collections/nlp/parts/nlp_overrides.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 8b2e06b4eb0c..1693cd23993a 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -195,7 +195,6 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None: pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=app_state.virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, - use_fp8=app_state.use_fp8, ) # assert that fake tp and pp rank match after model parallel init From 1c846b8bd2ff77f98ae73fbdf57a371d2bf2993c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Nov 2023 19:48:47 +0000 Subject: [PATCH 31/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../models/language_modeling/megatron/__init__.py | 2 +- .../megatron/falcon/falcon_decoder_layer.py | 3 +-- .../megatron/falcon/falcon_spec.py | 3 ++- 
.../models/language_modeling/megatron_gpt_model.py | 14 +++++++++----- .../convert_hf_falcon_to_nemo.py | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 70b6d4c169b8..e9a6714729c9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,8 +15,8 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon import falcon_spec + from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 037305dd6824..c4998b667081 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -16,14 +16,13 @@ from typing import Union import torch - from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttentionSubmodules +from megatron.core.transformer.base_layer import BaseLayer, LayerSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp -from megatron.core.transformer.base_layer import LayerSubmodules, BaseLayer from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index debcc2555d46..551f4e65bdfb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -49,7 +49,8 @@ def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: post_self_attn_layernorm=TENorm, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + module=MLP, + submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, ), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8b3260786641..46e8f0a3c4e7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -103,11 +103,14 @@ HAVE_TE = False -def get_specs(spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec'): #Assumes the default spec function name +def get_specs( + spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec' +): # Assumes 
the default spec function name import importlib.util + name_spec_dict = { - "": "megatron.core.models.gpt.gpt_layer_specs", #default GPT - "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec" #Other customized model spec locations + "": "megatron.core.models.gpt.gpt_layer_specs", # default GPT + "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec", # Other customized model spec locations } module_path = name_spec_dict.get(spec_name) if not module_path: @@ -120,6 +123,7 @@ def get_specs(spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec') raise ImportError(f"Module {module_path} does not have {spec_func}") return spec + class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ Megatron GPT Wrapper for ONNX export @@ -1623,9 +1627,9 @@ def build_transformer_config(self) -> TransformerConfig: transformer_config = TransformerConfig(**transformer_config_dict) - #pass mcore customization configs directly to mcore + # pass mcore customization configs directly to mcore mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) - for key,value in mcore_customization_config_dict.items(): + for key, value in mcore_customization_config_dict.items(): setattr(transformer_config, key, value) return transformer_config diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index fdce82d03023..d93e925c9f71 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -181,7 +181,7 @@ def load_falcon_config(args) -> FalconConfig: "seq_len_interpolation_factor": None, } - mcore_customization_config_dict={ + mcore_customization_config_dict = { "new_decoder_architecture": falcon_config.new_decoder_architecture, "parallel_attention": falcon_config.parallel_attn, } From 42ff405844233718ba72f53b21195bd9c3383f69 Mon Sep 17 00:00:00 2001 From: Vivian Date: Thu, 16 Nov 2023 20:21:59 +0000 Subject: [PATCH 32/69] small fix on the output file path --- .../convert_hf_falcon_to_nemo.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index d93e925c9f71..9c0a2d994229 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -122,9 +122,9 @@ def load_falcon_config(args) -> FalconConfig: "--input", type=str, required=True, - help="Falcon variants from HuggingFace hub or local dir with downloaded model", + help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) - parser.add_argument("--output", type=str, default=".", help="Path to dir where to store output .nemo file") + parser.add_argument("--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file") parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) @@ -132,9 +132,6 @@ def load_falcon_config(args) -> FalconConfig: args = parser.parse_args() - if not os.path.isdir(args.output): - raise FileNotFoundError(f"Output directory '{args.output}' does not exist") - falcon_config = load_falcon_config(args) with open(args.config, "r", encoding="utf_8") as f: orig_cfg = yaml.safe_load(f) @@ -288,9 +285,8 @@ def 
load_falcon_config(args) -> FalconConfig: dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = model.to(dtype=dtype) model.cfg.update(use_cpu_initialization=False) - tokenizer_name_part = model.cfg.tokenizer["type"].split("/")[1] - model.save_to(os.path.join(args.output, f'falcon_{tokenizer_name_part}_{args.precision}_tp1_pp1.nemo')) - logging.info("Done.") + model.save_to(args.output) + logging.info(f'Done. NeMo model saved to: {args.output}') tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) logging.info(f'nemo model created and saved. Total time: {t}') From 0fea44739e35c72827ed54669481cecb9ea399bc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Nov 2023 20:28:15 +0000 Subject: [PATCH 33/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 9c0a2d994229..39f194ecaf8d 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -124,7 +124,9 @@ def load_falcon_config(args) -> FalconConfig: required=True, help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) - parser.add_argument("--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file") + parser.add_argument( + "--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file" + ) parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) From 39b78a9ad4bd985065f705f0e1158dc2708378a7 Mon Sep 17 00:00:00 2001 From: Vivian Date: Thu, 16 Nov 2023 23:40:20 +0000 Subject: [PATCH 34/69] add nemo to hf conversion script --- .../convert_nemo_falcon_to_hf.py | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py new file mode 100644 index 000000000000..338f02b7be1b --- /dev/null +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -0,0 +1,171 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from pytorch_lightning import Trainer +from transformers import AutoModelForCausalLM + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging + +""" +Script to convert a falcon checkpoint in nemo (mcore path) into a HuggingFace checkpoint. +This script can be used to 1) generate only the HF weights, or 2) generate an entire HF model folder. + +1) Generate only HF weights from a nemo file: + + python convert_nemo_falcon_to_hf.py \ + --in-file /path/to/file.nemo or /path/to/extracted_folder \ + --out-file /path/to/pytorch_model.bin + +2) Generate the full HF model folder + + python convert_nemo_falcon_to_hf.py \ + --in-file /path/to/file.nemo or /path/to/extracted_folder \ + --out-file /path/to/pytorch_model.bin \ + --hf-in-file /path/to/input_hf_folder \ + --hf-out-file /path/to/output_hf_folder + + Use the --cpu-only flag if the model cannot fit in the GPU (e.g. falcon 180b). + However this option makes the conversion script significantly slower. +""" + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--in-file", type=str, default=None, required=True, help="Path to .nemo file", + ) + parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to HF .bin file") + parser.add_argument( + "--hf-in-path", + type=str, + default=None, + help="A HF model path, " "e.g. a folder containing https://huggingface.co/meta-falcon/falcon-2-7b-hf/tree/main", + ) + parser.add_argument( + "--hf-out-path", + type=str, + default=None, + help="Output HF model path, " "with the same format as above but user's own weights", + ) + parser.add_argument( + "--precision", + type=str, + default=None, + help="Precision of output weights." + "Defaults to precision of the input nemo weights (model.cfg.trainer.precision)", + ) + parser.add_argument( + "--cpu-only", + action="store_true", + help="Load model in cpu only. Useful if the model cannot fit in GPU memory, " + "but this option makes the conversion script significantly slower.", + ) + args = parser.parse_args() + return args + + +def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None: + """ + Convert NeMo weights to HF weights + """ + dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy()) + if cpu_only: + map_location = torch.device('cpu') + model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) + model_config.use_cpu_initialization = True + else: + map_location, model_config = None, None + + if cpu_only: + logging.info("******** Loading model on CPU. 
This will take a significant amount of time.") + model = MegatronGPTModel.restore_from( + input_nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location + ) + if precision is None: + precision = model.cfg.precision + if precision in [32, "32"]: + dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + dtype = torch.float16 + elif precision in ["bf16", "bf16-mixed"]: + dtype = torch.bfloat16 + else: + logging.warning(f"Precision string {precision} is not recognized, falling back to fp32") + dtype = torch.float32 # fallback + + param_to_weights = lambda param: param.to(dtype) + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + + def get_original_key(new_key): + new_key = new_key[len(prefix):] + + if new_key.startswith("embedding.word_embeddings.weight"): + return "transformer.word_embeddings.weight" + elif new_key.startswith("decoder.final_layernorm"): + return new_key.replace("decoder.final_layernorm", "transformer.ln_f") + elif new_key.startswith("output_layer"): + return new_key.replace("output_layer", "lm_head") + + key = new_key.replace("decoder.layers", "transformer.h") + + if model.cfg.new_decoder_architecture: + key = key.replace("input_layernorm", "ln_attn") + key = key.replace("pre_mlp_layernorm", "ln_mlp") + else: + key = key.replace("input_layernorm", "input_layernorm") + if not model.cfg.parallel_attention: + key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") + + key = key.replace("self_attention.linear_proj", "self_attention.dense") + key = key.replace("self_attention.linear_qkv", "self_attention.query_key_value") + key = key.replace("linear_fc1", "dense_h_to_4h") + key = key.replace("linear_fc2", "dense_4h_to_h") + return key + + prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' 
+ + for key, value in model.state_dict().items(): + orig_key = get_original_key(key) + checkpoint['state_dict'][orig_key] = param_to_weights(value) + + os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) + torch.save(checkpoint, output_hf_file) + logging.info(f"Weights reverted and saved to {output_hf_file}") + + +def replace_hf_weights(weights_file, input_hf_path, output_hf_path): + model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True) + nemo_exported = torch.load(weights_file) + + model.load_state_dict(nemo_exported['state_dict']) + model.save_pretrained(output_hf_path) + logging.info(f"Full HF model saved to {output_hf_path}") + + +if __name__ == '__main__': + args = get_args() + convert(args.in_file, args.out_file, precision=args.precision, cpu_only=args.cpu_only) + if args.hf_in_path and args.hf_out_path: + replace_hf_weights(args.out_file, args.hf_in_path, args.hf_out_path) + else: + logging.info("`hf-in-path` and/or `hf-out-path` not provided, not generating full HF model.") + logging.info(f".bin file is saved to {args.out_file}") From c85f3ac4d8427a515f1168aedb8624dca2b2709e Mon Sep 17 00:00:00 2001 From: Vivian Date: Fri, 17 Nov 2023 18:43:11 +0000 Subject: [PATCH 35/69] fix on base layer config and missing state dict due to dist ckpt --- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 338f02b7be1b..923079f239f5 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -127,12 +127,12 @@ def get_original_key(new_key): key = new_key.replace("decoder.layers", "transformer.h") - if model.cfg.new_decoder_architecture: + if model.cfg.mcore_customization_config.new_decoder_architecture: key = key.replace("input_layernorm", "ln_attn") key = key.replace("pre_mlp_layernorm", "ln_mlp") else: key = key.replace("input_layernorm", "input_layernorm") - if not model.cfg.parallel_attention: + if not model.cfg.mcore_customization_config.parallel_attention: key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") key = key.replace("self_attention.linear_proj", "self_attention.dense") @@ -144,7 +144,10 @@ def get_original_key(new_key): prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' for key, value in model.state_dict().items(): + if '_extra_state' in key: + continue orig_key = get_original_key(key) + print(f'Converting {key} to {orig_key}') checkpoint['state_dict'][orig_key] = param_to_weights(value) os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) From b0c1bb73ee7b452055f840f18db167e2b4a346f3 Mon Sep 17 00:00:00 2001 From: Vivian Date: Fri, 17 Nov 2023 18:47:17 +0000 Subject: [PATCH 36/69] Revert "fix on base layer config and missing state dict due to dist ckpt" This reverts commit c85f3ac4d8427a515f1168aedb8624dca2b2709e. 
--- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 923079f239f5..338f02b7be1b 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -127,12 +127,12 @@ def get_original_key(new_key): key = new_key.replace("decoder.layers", "transformer.h") - if model.cfg.mcore_customization_config.new_decoder_architecture: + if model.cfg.new_decoder_architecture: key = key.replace("input_layernorm", "ln_attn") key = key.replace("pre_mlp_layernorm", "ln_mlp") else: key = key.replace("input_layernorm", "input_layernorm") - if not model.cfg.mcore_customization_config.parallel_attention: + if not model.cfg.parallel_attention: key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") key = key.replace("self_attention.linear_proj", "self_attention.dense") @@ -144,10 +144,7 @@ def get_original_key(new_key): prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' for key, value in model.state_dict().items(): - if '_extra_state' in key: - continue orig_key = get_original_key(key) - print(f'Converting {key} to {orig_key}') checkpoint['state_dict'][orig_key] = param_to_weights(value) os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) From ce1bf4a87d977d3c860a9cedf753ccc5ab029dbd Mon Sep 17 00:00:00 2001 From: Vivian Date: Fri, 17 Nov 2023 18:49:20 +0000 Subject: [PATCH 37/69] fix on base layer config and missing state dict due to dist ckpt --- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 338f02b7be1b..3042baeda527 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -127,12 +127,12 @@ def get_original_key(new_key): key = new_key.replace("decoder.layers", "transformer.h") - if model.cfg.new_decoder_architecture: + if model.cfg.mcore_customization_config.new_decoder_architecture: key = key.replace("input_layernorm", "ln_attn") key = key.replace("pre_mlp_layernorm", "ln_mlp") else: key = key.replace("input_layernorm", "input_layernorm") - if not model.cfg.parallel_attention: + if not model.cfg.mcore_customization_config.parallel_attention: key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") key = key.replace("self_attention.linear_proj", "self_attention.dense") @@ -144,6 +144,8 @@ def get_original_key(new_key): prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' 
for key, value in model.state_dict().items(): + if '_extra_state' in key: + continue orig_key = get_original_key(key) checkpoint['state_dict'][orig_key] = param_to_weights(value) From 6351ae9c51fae3f22b43d2ff17bb24efa4505fdc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:24:17 +0000 Subject: [PATCH 38/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../convert_nemo_falcon_to_hf.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 3042baeda527..ac678a8676bf 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -57,7 +57,8 @@ def get_args(): "--hf-in-path", type=str, default=None, - help="A HF model path, " "e.g. a folder containing https://huggingface.co/meta-falcon/falcon-2-7b-hf/tree/main", + help="A HF model path, " + "e.g. a folder containing https://huggingface.co/meta-falcon/falcon-2-7b-hf/tree/main", ) parser.add_argument( "--hf-out-path", @@ -116,7 +117,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> checkpoint['state_dict'] = OrderedDict() def get_original_key(new_key): - new_key = new_key[len(prefix):] + new_key = new_key[len(prefix) :] if new_key.startswith("embedding.word_embeddings.weight"): return "transformer.word_embeddings.weight" @@ -124,7 +125,7 @@ def get_original_key(new_key): return new_key.replace("decoder.final_layernorm", "transformer.ln_f") elif new_key.startswith("output_layer"): return new_key.replace("output_layer", "lm_head") - + key = new_key.replace("decoder.layers", "transformer.h") if model.cfg.mcore_customization_config.new_decoder_architecture: @@ -140,15 +141,15 @@ def get_original_key(new_key): key = key.replace("linear_fc1", "dense_h_to_4h") key = key.replace("linear_fc2", "dense_4h_to_h") return key - + prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' 
- + for key, value in model.state_dict().items(): if '_extra_state' in key: continue orig_key = get_original_key(key) checkpoint['state_dict'][orig_key] = param_to_weights(value) - + os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) torch.save(checkpoint, output_hf_file) logging.info(f"Weights reverted and saved to {output_hf_file}") From a6c1fae27e0af42341255ea50022a40b8313e0e2 Mon Sep 17 00:00:00 2001 From: Vivian chen Date: Wed, 29 Nov 2023 19:32:13 +0000 Subject: [PATCH 39/69] fix megatron_gpt_model Signed-off-by: Vivian chen --- .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b110a306c56c..13472532b168 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -331,11 +331,7 @@ def model_provider_func(self, pre_process, post_process): if self.mcore_gpt: model = MCoreGPTModel( config=self.transformer_config, -<<<<<<< HEAD - transformer_layer_spec=get_specs(self.spec_name), -======= transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), ->>>>>>> main vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, From 1156b6af234a48e366e13c0e8ed9a7c27e9ef678 Mon Sep 17 00:00:00 2001 From: Vivian chen Date: Wed, 29 Nov 2023 20:00:02 +0000 Subject: [PATCH 40/69] modify model config Signed-off-by: Vivian chen --- .../language_modeling/conf/megatron_falcon_config.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml index b9a38aa5b952..143a75f3fc21 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -84,8 +84,6 @@ model: overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: 1 # Number of query groups for group query attention. If None, normal attention is used. 
| 7b: 1 | 40b: 8 | 180b: 8 - new_decoder_architecture: false - parallel_attention: true tokenizer: library: 'huggingface' @@ -215,3 +213,10 @@ model: warmup_steps: 500 constant_steps: 50000 min_lr: 2e-5 +gc_interval: 0 +precision: bf16 +mcore_customization_config: + new_decoder_architecture: false + parallel_attention: true +target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel +nemo_version: 1.21.0rc0 \ No newline at end of file From 5034dfc63b30e22dacf939fbac00afe1ad9d8dcf Mon Sep 17 00:00:00 2001 From: Vivian Chen <140748220+xuanzic@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:42:30 -0800 Subject: [PATCH 41/69] Apply suggestions from code review Co-authored-by: Eric Harper Signed-off-by: Vivian Chen <140748220+xuanzic@users.noreply.github.com> --- examples/nlp/language_modeling/conf/megatron_falcon_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml index 143a75f3fc21..6d45db3f7d80 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -37,7 +37,7 @@ exp_manager: mode: min always_save_nemo: False # saves nemo file during validation, not implemented for model parallel save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits - filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + filename: 'megatron_falcon--{val_loss:.2f}-{step}-{consumed_samples}' model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} model: From c97d38c8b2dc05ef29adba00f14f885dd69f6936 Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Fri, 1 Dec 2023 06:10:46 +0000 Subject: [PATCH 42/69] fix based on review Signed-off-by: Vivian Chen --- .../conf/megatron_falcon_config.yaml | 17 +++++------- .../conf/megatron_falcon_inference.yaml | 7 +++-- .../language_modeling/megatron/__init__.py | 1 - .../megatron/falcon/falcon_decoder_layer.py | 27 +++---------------- .../language_modeling/megatron_gpt_model.py | 2 +- 5 files changed, 15 insertions(+), 39 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml index 6d45db3f7d80..4b8009256a9e 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -84,6 +84,11 @@ model: overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: 1 # Number of query groups for group query attention. If None, normal attention is used. 
| 7b: 1 | 40b: 8 | 180b: 8 + gc_interval: 0 + precision: bf16 + mcore_customization_config: + new_decoder_architecture: false + parallel_attention: true tokenizer: library: 'huggingface' @@ -153,7 +158,6 @@ model: sequence_parallel: False ## Transformer Engine - transformer_engine: True fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID @@ -202,7 +206,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam + name: distributed_fused_adam lr: 2e-4 weight_decay: 0.01 betas: @@ -212,11 +216,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 -gc_interval: 0 -precision: bf16 -mcore_customization_config: - new_decoder_architecture: false - parallel_attention: true -target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel -nemo_version: 1.21.0rc0 \ No newline at end of file + min_lr: 2e-5 \ No newline at end of file diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml index 298b6a702571..1ccc9ed5dff8 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml @@ -19,10 +19,9 @@ trainer: precision: bf16 # 16, 32, or bf16 use_distributed_sampler: False -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +megatron_amp_O2: True # Enable O2-level automatic mixed precision to save memory gpt_model_file: null # GPT nemo file path checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index e9a6714729c9..3afb1e3fae48 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,7 +15,6 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.falcon import falcon_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index c4998b667081..63b32bc70d3f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -89,7 +89,6 @@ def __init__( self.parallel_attention = None ## [Module 1: Input Layernorm] Optional Layernorm on the input data - # TODO: add pytorch only layernorm self.input_layernorm = build_module( submodules.input_layernorm, config=self.config, @@ -123,8 +122,8 @@ def __init__( ) ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture - self.pre_mlp_layernorm = ( - build_module( + if self.new_decoder_architecture: + self.pre_mlp_layernorm = build_module( submodules.pre_mlp_layernorm, config=self.config, hidden_size=self.config.hidden_size, @@ -134,9 +133,8 @@ def __init__( zero_centered_gamma=self.config.layernorm_zero_centered_gamma, normalization=self.config.normalization, ) - if self.new_decoder_architecture - else None - ) + else: + self.pre_mlp_layernorm = None ## [Module 6: MLP block] self.mlp = build_module(submodules.mlp, config=self.config) @@ -144,12 +142,6 @@ def __init__( ## [Module 7: BiasDropoutFusion] Optional self.mlp_bda = build_module(submodules.mlp_bda) - # @jcasper how should we handle nvfuser? - # Set bias+dropout+add fusion grad_enable execution handler. - # TORCH_MAJOR = int(torch.__version__.split('.')[0]) - # TORCH_MINOR = int(torch.__version__.split('.')[1]) - # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) - # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad self.bias_dropout_add_exec_handler = torch.enable_grad def forward( @@ -182,8 +174,6 @@ def forward( rotary_pos_emb=rotary_pos_emb, ) - # TODO: could we move `bias_dropout_add_exec_handler` itself - # inside the module provided in the `bias_dropout_add_spec` module? with self.bias_dropout_add_exec_handler(): hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( attention_output_with_bias, residual, self.config.hidden_dropout @@ -208,26 +198,17 @@ def forward( mlp_output_without_bias = mlp_output + attn_output mlp_output_with_bias = (mlp_output_without_bias, None) - # TODO: could we move `bias_dropout_add_exec_handler` itself - # inside the module provided in the `bias_dropout_add_spec` module? 
with self.bias_dropout_add_exec_handler(): hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)( mlp_output_with_bias, residual, self.config.hidden_dropout ) - # Jit compiled function creates 'view' tensor. This tensor - # potentially gets saved in the MPU checkpoint function context, - # which rejects view tensors. While making a viewless tensor here - # won't result in memory savings (like the data loader, or - # p2p_communication), it serves to document the origin of this - # 'view' tensor. output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) return output, context def sharded_state_dict(self, prefix=''): - # state_dict = self.state_dict(prefix=prefix, keep_vars=True) state_dict = self.state_dict(keep_vars=True) tensor_parallel_layers_axis_map = { diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 13472532b168..0cf374336d9a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -16,7 +16,7 @@ import os import queue import warnings -from dataclasses import dataclass, fields +from dataclasses import fields from functools import partial from typing import Any, Dict, Iterator, List, Optional, Union From 4383bd17386b789ed814d7655fcb9c8800bbf358 Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Sat, 2 Dec 2023 01:09:59 +0000 Subject: [PATCH 43/69] multiple revise based on review and latest mcore changes --- .../megatron/falcon/falcon_decoder_layer.py | 12 ---------- .../language_modeling/megatron_gpt_model.py | 2 +- .../convert_hf_falcon_to_nemo.py | 9 ++++--- .../convert_nemo_falcon_to_hf.py | 24 +++++++++---------- 4 files changed, 16 insertions(+), 31 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 63b32bc70d3f..fb0b0d9e0093 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -94,10 +94,6 @@ def __init__( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, ) ## [Module 2: SelfAttention] @@ -115,10 +111,6 @@ def __init__( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, ) ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture @@ -128,10 +120,6 @@ def __init__( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, ) else: self.pre_mlp_layernorm = None diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0cf374336d9a..46f70d248c84 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -331,7 +331,7 @@ def model_provider_func(self, pre_process, post_process): if self.mcore_gpt: model = MCoreGPTModel( config=self.transformer_config, - transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + transformer_layer_spec=get_specs(self.spec_name), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 39f194ecaf8d..f02b6dc3b336 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -21,15 +21,14 @@ Example to run this conversion script: ``` python convert_hf_falcon_to_nemo.py \ - --config /path/to/megatron_gpt_config.yaml \ - --input \ - --output \ + --config /path/to/megatron_falcon_config.yaml \ + --input /path/to/hf/checkpoints/folder \ + --output /path/to/output/nemo/file \ --precision ``` """ import argparse -import os import time from typing import Dict @@ -125,7 +124,7 @@ def load_falcon_config(args) -> FalconConfig: help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) parser.add_argument( - "--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file" + "--output", type=str, required=True, help="Path to dir where to store output .nemo file" ) parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index ac678a8676bf..bce63409a519 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -21,6 +21,7 @@ from transformers import AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.utils import logging @@ -50,9 +51,9 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--in-file", type=str, default=None, required=True, help="Path to .nemo file", + "--in-file", type=str, required=True, help="Path to .nemo file", ) - parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to HF .bin file") + parser.add_argument("--out-file", type=str, required=True, help="Path to HF .bin file") parser.add_argument( "--hf-in-path", type=str, @@ -92,6 +93,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> map_location = torch.device('cpu') model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) model_config.use_cpu_initialization = True + model_config.tensor_model_parallel_size = 1 else: map_location, model_config = None, None @@ -102,19 +104,15 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> ) if precision is 
None: precision = model.cfg.precision - if precision in [32, "32"]: - dtype = torch.float32 - elif precision in [16, "16", "16-mixed"]: - dtype = torch.float16 - elif precision in ["bf16", "bf16-mixed"]: - dtype = torch.bfloat16 - else: - logging.warning(f"Precision string {precision} is not recognized, falling back to fp32") + try: + dtype = torch_dtype_from_precision(precision) + except ValueError as e: + # warning that {precision} is not supported, fallback to float32 + logging.warning(str(e) + f", precision string '{precision}' is not recognized, falling back to fp32") dtype = torch.float32 # fallback param_to_weights = lambda param: param.to(dtype) checkpoint = OrderedDict() - checkpoint['state_dict'] = OrderedDict() def get_original_key(new_key): new_key = new_key[len(prefix) :] @@ -148,7 +146,7 @@ def get_original_key(new_key): if '_extra_state' in key: continue orig_key = get_original_key(key) - checkpoint['state_dict'][orig_key] = param_to_weights(value) + checkpoint[orig_key] = param_to_weights(value) os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) torch.save(checkpoint, output_hf_file) @@ -159,7 +157,7 @@ def replace_hf_weights(weights_file, input_hf_path, output_hf_path): model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True) nemo_exported = torch.load(weights_file) - model.load_state_dict(nemo_exported['state_dict']) + model.load_state_dict(nemo_exported) model.save_pretrained(output_hf_path) logging.info(f"Full HF model saved to {output_hf_path}") From b499c8805f18478d4d243f55b3e7aa94dfd12ca5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:11:20 +0000 Subject: [PATCH 44/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 4 +--- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index f02b6dc3b336..ef9410b1b929 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -123,9 +123,7 @@ def load_falcon_config(args) -> FalconConfig: required=True, help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) - parser.add_argument( - "--output", type=str, required=True, help="Path to dir where to store output .nemo file" - ) + parser.add_argument("--output", type=str, required=True, help="Path to dir where to store output .nemo file") parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index bce63409a519..cbe70064e272 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -21,8 +21,8 @@ from transformers import AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from 
nemo.utils import logging """ From 94f4ba25d664ba639151ceb86814dbd1ef6a9efb Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Sat, 2 Dec 2023 01:16:40 +0000 Subject: [PATCH 45/69] fix Signed-off-by: Vivian Chen --- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index cbe70064e272..66f6399855a3 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -107,7 +107,6 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> try: dtype = torch_dtype_from_precision(precision) except ValueError as e: - # warning that {precision} is not supported, fallback to float32 logging.warning(str(e) + f", precision string '{precision}' is not recognized, falling back to fp32") dtype = torch.float32 # fallback From 8928050323acc091a5131e7d01582680ed4a2451 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Sun, 3 Dec 2023 02:05:41 -0800 Subject: [PATCH 46/69] subclass from TransformerLayer --- .../megatron/falcon/falcon_decoder_layer.py | 63 ++----------------- .../megatron/falcon/falcon_spec.py | 6 +- 2 files changed, 7 insertions(+), 62 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index fb0b0d9e0093..1e3c83f6f394 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -20,9 +20,8 @@ from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttentionSubmodules -from megatron.core.transformer.base_layer import BaseLayer, LayerSubmodules from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor @@ -42,21 +41,7 @@ hyperparameters: transformer hyperparameters """ - -@dataclass -class FalconTransformerLayerSubmodules(LayerSubmodules): - input_layernorm: Union[ModuleSpec, type] = IdentityOp - self_attention: Union[ModuleSpec, type] = IdentityOp - self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp - - post_self_attn_layernorm: Union[ModuleSpec, type] = IdentityOp - - pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp - mlp: Union[ModuleSpec, type] = IdentityOp - mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp - - -class FalconTransformerLayer(BaseLayer): +class FalconTransformerLayer(TransformerLayer): """A single transformer layer. 
Transformer layer takes input with size [s, b, h] and returns an @@ -67,53 +52,21 @@ class FalconTransformerLayer(BaseLayer): def __init__( self, config: TransformerConfig, # should come from FalconTransformerConfig class - submodules: FalconTransformerLayerSubmodules, + submodules: TransformerLayerSubmodules, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): - super().__init__(config=config, submodules=submodules) - self.config: TransformerConfig = config - - self.layer_number = layer_number + self._get_layer_offset() - - self.self_attn_mask_type = self_attn_mask_type + super().__init__(config=config, submodules=submodules, layer_number=layer_number) if hasattr(self.config, 'new_decoder_architecture'): self.new_decoder_architecture = self.config.new_decoder_architecture else: self.new_decoder_architecture = None - if hasattr(self.config, 'parallel_attention'): self.parallel_attention = self.config.parallel_attention else: self.parallel_attention = None - ## [Module 1: Input Layernorm] Optional Layernorm on the input data - self.input_layernorm = build_module( - submodules.input_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - - ## [Module 2: SelfAttention] - self.self_attention = build_module(submodules.self_attention, config=self.config, layer_number=layer_number,) - - ## [Module 3: BiasDropoutFusion] Optional - self.self_attn_bda = build_module(submodules.self_attn_bda) - - ## [Module 4: Post SelfAttention] Optional Layernorm after self-attn - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - self.post_self_attn_layernorm = build_module( - submodules.post_self_attn_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - - ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture if self.new_decoder_architecture: self.pre_mlp_layernorm = build_module( submodules.pre_mlp_layernorm, @@ -124,14 +77,6 @@ def __init__( else: self.pre_mlp_layernorm = None - ## [Module 6: MLP block] - self.mlp = build_module(submodules.mlp, config=self.config) - - ## [Module 7: BiasDropoutFusion] Optional - self.mlp_bda = build_module(submodules.mlp_bda) - - self.bias_dropout_add_exec_handler = torch.enable_grad - def forward( self, hidden_states, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 551f4e65bdfb..e53092e86a6b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -28,13 +28,14 @@ from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec -from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules +from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: return ModuleSpec( module=FalconTransformerLayer, - submodules=FalconTransformerLayerSubmodules( + submodules=TransformerLayerSubmodules( input_layernorm=TENorm, self_attention=ModuleSpec( module=SelfAttention, @@ -46,7 +47,6 @@ def 
get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: ), ), self_attn_bda=get_bias_dropout_add, - post_self_attn_layernorm=TENorm, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( module=MLP, From b5234409b1dfa6c8357fb3eebe360b7236cbcbf1 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Sun, 3 Dec 2023 03:11:25 -0800 Subject: [PATCH 47/69] fixes according to comments --- .../megatron/falcon/falcon_decoder_layer.py | 1 - .../megatron/falcon/falcon_spec.py | 2 +- .../language_modeling/megatron_gpt_model.py | 24 ++++++------------- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 1e3c83f6f394..b20a493f5b08 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -94,7 +94,6 @@ def forward( if self.new_decoder_architecture: mlp_ln_output = self.pre_mlp_layernorm(hidden_states) - # Optional Input Layer norm input_layernorm_output = self.input_layernorm(hidden_states) input_mlp_ln = input_layernorm_output diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index e53092e86a6b..22238a9152be 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -32,7 +32,7 @@ from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE -def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: +def get_falcon_layer_spec() -> ModuleSpec: return ModuleSpec( module=FalconTransformerLayer, submodules=TransformerLayerSubmodules( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 46f70d248c84..a9cec5f9bfc6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -58,6 +58,7 @@ ) from nemo.collections.nlp.parts import utils_funcs from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType @@ -104,25 +105,14 @@ HAVE_TE = False -def get_specs( - spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec' -): # Assumes the default spec function name - import importlib.util - +def get_specs(spec_name): name_spec_dict = { - "": "megatron.core.models.gpt.gpt_layer_specs", # default GPT - "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec", # Other customized model spec locations + "":get_gpt_layer_with_transformer_engine_spec(), + "megatron_falcon_gpt":get_falcon_layer_spec() } - module_path = name_spec_dict.get(spec_name) - if not module_path: - raise ImportError(f"Failed to import {spec_name}, please ensure {spec_name} is supported.") - - module = importlib.import_module(module_path) - try: - spec = getattr(module, spec_func)() - except AttributeError: - raise 
ImportError(f"Module {module_path} does not have {spec_func}") - return spec + if spec_name not in name_spec_dict: + raise ValueError(f"Spec name '{spec_name}' is not recognized.") + return name_spec_dict[spec_name] class MegatronGPTExportableModel(torch.nn.Module, Exportable): From 8ebf142a47348d81766272e26551960d4282cf71 Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Mon, 4 Dec 2023 17:45:52 +0000 Subject: [PATCH 48/69] add falcon ci test Signed-off-by: Vivian Chen --- Jenkinsfile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 12fafac57a67..ad4a5955985f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -143,6 +143,15 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo' } } + stage('Falcon') { + steps { + sh 'python scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py \ + --config examples/nlp/language_modeling/conf/megatron_falcon_config.yaml \ + --input /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ + --output /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' + } + } } } From 1c1bc512821b4fcf79808543bda14cc3c115a447 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Mon, 4 Dec 2023 10:23:40 -0800 Subject: [PATCH 49/69] add post_self_attn_layernorm --- .../megatron/falcon/falcon_decoder_layer.py | 9 +++++++++ .../language_modeling/megatron/falcon/falcon_spec.py | 11 +++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index b20a493f5b08..c2732423d73e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -67,6 +67,15 @@ def __init__( else: self.parallel_attention = None + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) if self.new_decoder_architecture: self.pre_mlp_layernorm = build_module( submodules.pre_mlp_layernorm, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 22238a9152be..51afb58b84c9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -33,9 +33,7 @@ # Use this spec for an implementation using modules in TE def get_falcon_layer_spec() -> ModuleSpec: - return ModuleSpec( - module=FalconTransformerLayer, - submodules=TransformerLayerSubmodules( + falcon_submodules = TransformerLayerSubmodules( input_layernorm=TENorm, self_attention=ModuleSpec( module=SelfAttention, @@ -53,5 +51,10 @@ def get_falcon_layer_spec() -> ModuleSpec: submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, - ), + ) + #Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. 
+ falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec( + module=FalconTransformerLayer, + submodules=falcon_submodules ) From f84fee6bcf42f8ddd0c54c97138a9d13bb7fd0d1 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 7 Dec 2023 13:48:43 -0800 Subject: [PATCH 50/69] add explicit explanation/refs for handling lora logic --- .../common/megatron/adapters/mcore_mixins.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 588135df4f50..d36c9c7dbf09 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -21,6 +21,7 @@ from megatron.core.transformer.mlp import MLP from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor +from megatron.core.transformer.custom_layers.transformer_engine import TELayerNormColumnParallelLinear, TEColumnParallelLinear from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, @@ -65,19 +66,28 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] linear_qkv_output, _ = self.linear_qkv(hidden_states) layernorm_output = None - if isinstance(linear_qkv_output, tuple): # if LN and linear fused, both will be returned + + # In megatron/core/models/gpt/gpt_layer_specs.py TELayerNormColumnParallelLinear is used for linear_qkv. + # TELayerNormColumnParallelLinear fused LN and linear, both will be returned. + # In nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py TEColumnParallelLinear is used for linear_qkv, + # which only returns linear. + if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): mixed_qkv, layernorm_output = linear_qkv_output - else: # otherwise only mixed_qkv + elif isinstance(self.linear_qkv, TEColumnParallelLinear): # only mixed_qkv mixed_qkv = linear_qkv_output + else: + raise ValueError(f"Unrecognized module type '{type(self.linear_qkv)}' when getting query, key, value tensors for mcore mixins. 
") # LoRA logic if self.is_adapter_available(): lora_kqv_adapter = self.get_adapter_module(AdapterName.LORA_KQV_ADAPTER) if lora_kqv_adapter: - if layernorm_output: + if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): lora_mixed_qkv = lora_kqv_adapter(layernorm_output) - else: + elif isinstance(self.linear_qkv, TEColumnParallelLinear): lora_mixed_qkv = lora_kqv_adapter(hidden_states) + else: + raise ValueError(f"Unrecognized module type '{type(self.linear_qkv)}' when applying lora.") mixed_qkv = mixed_qkv + lora_mixed_qkv # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] From ea39e6874bdfe86cae3040843ecf79fdd4d06bea Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Dec 2023 23:10:33 +0000 Subject: [PATCH 51/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron/falcon/falcon_decoder_layer.py | 3 +- .../megatron/falcon/falcon_spec.py | 40 +++++++++---------- .../language_modeling/megatron_gpt_model.py | 7 +--- .../common/megatron/adapters/mcore_mixins.py | 11 +++-- 4 files changed, 30 insertions(+), 31 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index c2732423d73e..5f3c46961c85 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -21,9 +21,9 @@ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor """ We use the following notation throughout this file: @@ -41,6 +41,7 @@ hyperparameters: transformer hyperparameters """ + class FalconTransformerLayer(TransformerLayer): """A single transformer layer. 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 51afb58b84c9..ea265b03685e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -34,27 +34,23 @@ # Use this spec for an implementation using modules in TE def get_falcon_layer_spec() -> ModuleSpec: falcon_submodules = TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - ), + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ) - #Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. - falcon_submodules.post_self_attn_layernorm = TENorm - return ModuleSpec( - module=FalconTransformerLayer, - submodules=falcon_submodules + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, ) + # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. 
+ falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e97905b9c180..a70ef40ae262 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -31,6 +31,7 @@ MegatronPretrainingSampler, ) from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import build_train_valid_test_datasets +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model @@ -58,7 +59,6 @@ ) from nemo.collections.nlp.parts import utils_funcs from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType @@ -106,10 +106,7 @@ def get_specs(spec_name): - name_spec_dict = { - "":get_gpt_layer_with_transformer_engine_spec(), - "megatron_falcon_gpt":get_falcon_layer_spec() - } + name_spec_dict = {"": get_gpt_layer_with_transformer_engine_spec(), "megatron_falcon_gpt": get_falcon_layer_spec()} if spec_name not in name_spec_dict: raise ValueError(f"Spec name '{spec_name}' is not recognized.") return name_spec_dict[spec_name] diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index d36c9c7dbf09..eb86c8324dcd 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -18,10 +18,13 @@ from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding from megatron.core.transformer.attention import SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TELayerNormColumnParallelLinear, +) from megatron.core.transformer.mlp import MLP from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor -from megatron.core.transformer.custom_layers.transformer_engine import TELayerNormColumnParallelLinear, TEColumnParallelLinear from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, @@ -71,12 +74,14 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): # TELayerNormColumnParallelLinear fused LN and linear, both will be returned. # In nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py TEColumnParallelLinear is used for linear_qkv, # which only returns linear. 
- if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): + if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): mixed_qkv, layernorm_output = linear_qkv_output elif isinstance(self.linear_qkv, TEColumnParallelLinear): # only mixed_qkv mixed_qkv = linear_qkv_output else: - raise ValueError(f"Unrecognized module type '{type(self.linear_qkv)}' when getting query, key, value tensors for mcore mixins. ") + raise ValueError( + f"Unrecognized module type '{type(self.linear_qkv)}' when getting query, key, value tensors for mcore mixins. " + ) # LoRA logic if self.is_adapter_available(): From aea1e81b70822f6e0b097579599228178669f71a Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Sat, 9 Dec 2023 11:15:04 -0800 Subject: [PATCH 52/69] fixes for code scanning --- .../language_modeling/megatron/falcon/falcon_decoder_layer.py | 3 +-- .../models/language_modeling/megatron/falcon/falcon_spec.py | 4 ---- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 5f3c46961c85..13a460e1b285 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -18,8 +18,6 @@ import torch from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig @@ -101,6 +99,7 @@ def forward( # Residual connection. residual = hidden_states + mlp_ln_output = None if self.new_decoder_architecture: mlp_ln_output = self.pre_mlp_layernorm(hidden_states) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index ea265b03685e..28c96bb12af3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -13,17 +13,13 @@ # limitations under the License. 
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.fusions.fused_layer_norm import FusedLayerNorm -from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.custom_layers.transformer_engine import ( TEColumnParallelLinear, TEDotProductAttention, - TELayerNormColumnParallelLinear, TENorm, TERowParallelLinear, ) -from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a70ef40ae262..775754fb5f33 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -80,7 +80,6 @@ from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module - from megatron.core.transformer.spec_utils import import_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal From b0966c1394935a318dc38aef986ea9913313e729 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Mon, 11 Dec 2023 21:59:03 -0800 Subject: [PATCH 53/69] remove unused imports --- .../megatron/falcon/falcon_decoder_layer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 13a460e1b285..cbd3430adba8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dataclasses import dataclass -from typing import Union - -import torch from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.spec_utils import build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor From 0c9a2e369efb45bf77ba82fb8353139f4c96eefb Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 12 Dec 2023 14:53:28 -0800 Subject: [PATCH 54/69] unit test for falcon model --- tests/collections/nlp/test_falcon_model.py | 277 +++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 tests/collections/nlp/test_falcon_model.py diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py new file mode 100644 index 000000000000..00317bae3fc4 --- /dev/null +++ b/tests/collections/nlp/test_falcon_model.py @@ -0,0 +1,277 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pytest +import torch +from omegaconf import DictConfig +from pytorch_lightning import Trainer + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + +DEVICE_CAPABILITY = None +if torch.cuda.is_available(): + DEVICE_CAPABILITY = torch.cuda.get_device_capability() + + +@pytest.fixture() +def model_cfg(test_data_dir): + + model_cfg = { + 'mcore_gpt': True, + 'micro_batch_size': 1, + 'global_batch_size': 1, + 'rampup_batch_size': None, + 'tensor_model_parallel_size': 1, + 'pipeline_model_parallel_size': 1, + 'virtual_pipeline_model_parallel_size': None, + 'encoder_seq_length': 512, + 'max_position_embeddings': 512, + 'num_layers': 1, + 'hidden_size': 128, + 'ffn_hidden_size': 512, + 'num_attention_heads': 2, + 'num_query_groups': 1, + 'init_method_std': 0.02, + 'use_scaled_init_method': True, + 'hidden_dropout': 0.0, + 'attention_dropout': 0.0, + 'ffn_dropout': 0, + 'kv_channels': None, + 'apply_query_key_layer_scaling': False, + 'normalization': 'layernorm', + 'layernorm_epsilon': 1e-05, + 'do_layer_norm_weight_decay': False, + 'make_vocab_size_divisible_by': 128, + 'pre_process': True, + 'post_process': True, + 'persist_layer_norm': True, + 'bias': False, + 'activation': 'gelu', + 'headscale': False, + 'transformer_block_type': 'pre_ln', + 'openai_gelu': False, + 'normalize_attention_scores': True, + 'position_embedding_type': 'rope', + 'rotary_percentage': 1.0, + 'attention_type': 'multihead', + 'share_embeddings_and_output_weights': False, + 'overlap_p2p_comm': False, + 'batch_p2p_comm': True, + 'seq_len_interpolation_factor': None, + 'tokenizer': { + 'library': 'huggingface', + 'type': 'tiiuae/falcon-40b', + 'use_fast': True + }, + 'native_amp_init_scale': 4294967296, + 'native_amp_growth_interval': 1000, + 'hysteresis': 2, + 'fp32_residual_connection': False, + 'fp16_lm_cross_entropy': False, + 'megatron_amp_O2': False, + 'grad_allreduce_chunk_size_mb': 125, + 'grad_div_ar_fusion': True, + 'gradient_accumulation_fusion': False, + 'bias_activation_fusion': False, + 'bias_dropout_add_fusion': False, + 'masked_softmax_fusion': True, + 'get_attention_mask_from_fusion': True, + 'seed': 1234, + 'resume_from_checkpoint': None, + 'use_cpu_initialization': False, + 'onnx_safe': False, + 'apex_transformer_log_level': 30, + 'gradient_as_bucket_view': True, + 'sync_batch_comm': False, + 'activations_checkpoint_granularity': None, + 'activations_checkpoint_method': None, + 'activations_checkpoint_num_layers': None, + 'num_micro_batches_with_partial_activation_checkpoints': None, + 'activations_checkpoint_layers_per_pipeline': None, + 'sequence_parallel': False, + 'transformer_engine': True, + 'fp8': False, + 'fp8_e4m3': False, + 'fp8_hybrid': False, + 'fp8_margin': 0, + 'fp8_interval': 1, + 'fp8_amax_history_len': 1, + 'fp8_amax_compute_algo': 'most_recent', + 'reduce_amax': True, + 'use_emha': False, + 'ub_tp_comm_overlap': False, + 'ub_tp_comm_overlap_cfg': None, + 'use_flash_attention': False, + 'nsys_profile': { + 'enabled': False, + 'start_step': 10, + 'end_step': 10, + 'ranks': [0], + 'gen_shape': False}, + 'optim': { + 'name': 'distributed_fused_adam', + 'lr': '2e-4', + 'weight_decay': 0.01, + 'betas': [0.9, 0.98], + 'sched': { + 'name': 'CosineAnnealing', + 'warmup_steps': 500, 
+ 'constant_steps': 50000, + 'min_lr': '2e-5'} + }, + 'gc_interval': 0, + 'precision': 'bf16', + 'new_decoder_architecture': False, + 'parallel_attention': True, + 'name': 'megatron_falcon_gpt', + 'target': 'nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel', + } + return model_cfg + + +@pytest.fixture() +def trainer_cfg(): + + trainer_cfg = { + 'devices': 1, + 'num_nodes': 1, + 'accelerator': 'gpu', + 'precision': 'bf16', + 'logger': False, + 'enable_checkpointing': False, + 'use_distributed_sampler': False, + 'max_epochs': 1000, + 'max_steps': 100000, + 'log_every_n_steps': 10, + 'val_check_interval': 100, + 'limit_val_batches': 50, + 'limit_test_batches': 500, + 'accumulate_grad_batches': 1, + 'gradient_clip_val': 1.0, + } + + return trainer_cfg + + +@pytest.fixture() +def precision(): + return 'bf16' + + +@pytest.fixture() +def falcon_gpt_model(model_cfg, trainer_cfg, precision): + model_cfg['precision'] = precision + trainer_cfg['precision'] = precision + + strategy = NLPDDPStrategy() + + trainer = Trainer(strategy=strategy, **trainer_cfg) + + cfg = DictConfig(model_cfg) + + model = MegatronGPTModel(cfg=cfg, trainer=trainer) + + return model + + +@pytest.fixture() +def test_text(): + test_text = [ + "hello, world", + "four score and seven years ago", + "Your time is limited", + "If you set goals rediculously high", + ] + return test_text + + +@pytest.mark.run_only_on('GPU') +class TestFalconGPTModel: + @pytest.mark.unit + def test_constructor(self, falcon_gpt_model): + assert isinstance(falcon_gpt_model, MegatronGPTModel) + + num_weights = falcon_gpt_model.num_weights + assert num_weights == 16827136 + + @pytest.mark.unit + def test_tokenizer(self, falcon_gpt_model, test_text): + + assert isinstance(falcon_gpt_model.tokenizer, AutoTokenizer) + assert falcon_gpt_model.tokenizer.name == 'PreTrainedTokenizerFast' + assert falcon_gpt_model.tokenizer.vocab_size == 65024 + + ids = [falcon_gpt_model.tokenizer.text_to_ids(text) for text in test_text] + + true_ids = [ + [30835, 23, 1079], + [18584, 5179, 273, 5144, 909, 2323], + [4560, 601, 304, 3991], + [1424, 299, 889, 4258, 2400, 276, 20201, 986], + ] + assert sum([id_list == true_id_list for id_list, true_id_list in zip(ids, true_ids)]) == 4 + + @pytest.mark.parametrize( + "precision", + [ + 32, + 16, + pytest.param( + "bf16", + marks=pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, + reason='bfloat16 is not supported on this device', + ), + ), + ], + ) + @pytest.mark.unit + def test_forward(self, falcon_gpt_model, test_text): + + dtype = falcon_gpt_model.torch_dtype + + falcon_gpt_model.eval() + + ids = [falcon_gpt_model.tokenizer.text_to_ids(text) for text in test_text] + + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] + + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, falcon_gpt_model.tokenizer.eos_id, False, False, False) + for id_tensor in id_tensors + ] + output_tensors = [] + with torch.no_grad(): + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + assert tokens.shape == pos_ids.shape + assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1] + with torch.autocast('cuda', dtype=dtype): + output_tensor = falcon_gpt_model.forward( + tokens=tokens.cuda(), + text_position_ids=pos_ids.cuda(), + attention_mask=attn_mask.cuda(), + labels=None, + ) + # output is [b s h] + assert output_tensor.shape[0] == 1 + assert 
output_tensor.shape[1] == tokens.shape[1] + assert output_tensor.shape[2] == falcon_gpt_model.padded_vocab_size + assert output_tensor.dtype == dtype + output_tensors.append(output_tensor) From 8e8ba6624c42d8bb718bbb204a07b1d5486d7e83 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:03:55 +0000 Subject: [PATCH 55/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/collections/nlp/test_falcon_model.py | 29 ++++++---------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py index 00317bae3fc4..a36b64e82271 100644 --- a/tests/collections/nlp/test_falcon_model.py +++ b/tests/collections/nlp/test_falcon_model.py @@ -74,11 +74,7 @@ def model_cfg(test_data_dir): 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'seq_len_interpolation_factor': None, - 'tokenizer': { - 'library': 'huggingface', - 'type': 'tiiuae/falcon-40b', - 'use_fast': True - }, + 'tokenizer': {'library': 'huggingface', 'type': 'tiiuae/falcon-40b', 'use_fast': True}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, @@ -118,23 +114,14 @@ def model_cfg(test_data_dir): 'ub_tp_comm_overlap': False, 'ub_tp_comm_overlap_cfg': None, 'use_flash_attention': False, - 'nsys_profile': { - 'enabled': False, - 'start_step': 10, - 'end_step': 10, - 'ranks': [0], - 'gen_shape': False}, + 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': { - 'name': 'distributed_fused_adam', - 'lr': '2e-4', - 'weight_decay': 0.01, - 'betas': [0.9, 0.98], - 'sched': { - 'name': 'CosineAnnealing', - 'warmup_steps': 500, - 'constant_steps': 50000, - 'min_lr': '2e-5'} - }, + 'name': 'distributed_fused_adam', + 'lr': '2e-4', + 'weight_decay': 0.01, + 'betas': [0.9, 0.98], + 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 50000, 'min_lr': '2e-5'}, + }, 'gc_interval': 0, 'precision': 'bf16', 'new_decoder_architecture': False, From b1e63982ebd1f3694f5c73ed3b03f3c21124f33a Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 12 Dec 2023 20:42:17 -0800 Subject: [PATCH 56/69] add falcon transformer layer unit test --- .../nlp/test_falcon_transformer_layer.py | 118 ++++++++++++++++++ tests/utils/test_parallel_utils.py | 31 +++++ 2 files changed, 149 insertions(+) create mode 100644 tests/collections/nlp/test_falcon_transformer_layer.py create mode 100644 tests/utils/test_parallel_utils.py diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py new file mode 100644 index 000000000000..1562c7c3ac89 --- /dev/null +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer + +from tests.utils.test_parallel_utils import Utils + +class TestParallelFalconTransformerLayer: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.parallel_falcon_transformer_layer = FalconTransformerLayer(transformer_config, + get_falcon_layer_spec().submodules) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer + assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) + assert parallel_falcon_transformer_layer.layer_number == 1 + + num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) + assert num_weights == 1884 + + def test_gpu_forward(self): + parallel_transformer_layer = self.parallel_falcon_transformer_layer + config: TransformerConfig = parallel_transformer_layer.config + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_layer.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + hidden_states, context = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + @pytest.mark.parametrize('tp_pp', [(1, 1)]) + def test_sharded_state_dict(self, tp_pp): + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*tp_pp) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True) + parallel_transformer_layer = FalconTransformerLayer(transformer_config, + get_falcon_layer_spec().submodules) + + sharded_state_dict = parallel_transformer_layer.sharded_state_dict() + + extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} + sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} + assert all(isinstance(t, ShardedObject) for t in extra_states.values()) + assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) + + # Test all local shapes + tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} + tp_size = parallel_state.get_tensor_model_parallel_world_size() + ans = get_tensor_shapes_for_tp(transformer_config, tp_size) + assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) + + # Test all global shapes. 
Prepend num layers in front of expected shapes + tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} + expected_global_shapes = {k: (transformer_config.num_layers, *v) + for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items()} + assert tensor_global_shapes == expected_global_shapes + + # Test ShardedTensor keys + for state_dict_key, sh_ten in sharded_tensors.items(): + assert state_dict_key == f'0.{sh_ten.key}' + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + +def get_tensor_shapes_for_tp(transformer_config, tp_size): + hs = transformer_config.hidden_size + ffn_hs = transformer_config.ffn_hidden_size + return { + '0.input_layernorm.weight': (hs,), + '0.input_layernorm.bias': (hs,), + '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), + '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), + '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), + '0.mlp.linear_fc2.bias': (hs,), + '0.self_attention.linear_proj.weight': (hs, hs // tp_size), + '0.self_attention.linear_proj.bias': (hs,), + '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), + '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), + '0.post_self_attn_layernorm.weight': (hs,), + '0.post_self_attn_layernorm.bias': (hs,) + } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py new file mode 100644 index 000000000000..ead8ec8b744e --- /dev/null +++ b/tests/utils/test_parallel_utils.py @@ -0,0 +1,31 @@ +import os +import torch +import megatron.core.parallel_state as ps + +class Utils: + + world_size = 1 #one gpu for unit test + os.environ['LOCAL_RANK']='0' + rank = int(os.environ['LOCAL_RANK']) + + @staticmethod + def initialize_distributed(): + print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') + torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): + ps.destroy_model_parallel() + if not torch.distributed.is_initialized(): + Utils.initialize_distributed() + ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) From 2759755187783d081a6d2b8661a14d8d18240214 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 04:49:26 +0000 Subject: [PATCH 57/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/test_falcon_transformer_layer.py | 32 +++++++++++-------- tests/utils/test_parallel_utils.py | 28 ++++++++++++---- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 1562c7c3ac89..80765edaa454 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ 
b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -16,24 +16,26 @@ import pytest import torch - from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.transformer_config import TransformerConfig -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from tests.utils.test_parallel_utils import Utils -class TestParallelFalconTransformerLayer: +class TestParallelFalconTransformerLayer: def setup_method(self, method): - Utils.initialize_model_parallel(1,1) + Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) - self.parallel_falcon_transformer_layer = FalconTransformerLayer(transformer_config, - get_falcon_layer_spec().submodules) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.parallel_falcon_transformer_layer = FalconTransformerLayer( + transformer_config, get_falcon_layer_spec().submodules + ) def teardown_method(self, method): Utils.destroy_model_parallel() @@ -68,9 +70,10 @@ def test_sharded_state_dict(self, tp_pp): Utils.destroy_model_parallel() Utils.initialize_model_parallel(*tp_pp) model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig(num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True) - parallel_transformer_layer = FalconTransformerLayer(transformer_config, - get_falcon_layer_spec().submodules) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True + ) + parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) sharded_state_dict = parallel_transformer_layer.sharded_state_dict() @@ -87,8 +90,9 @@ def test_sharded_state_dict(self, tp_pp): # Test all global shapes. 
Prepend num layers in front of expected shapes tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} - expected_global_shapes = {k: (transformer_config.num_layers, *v) - for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items()} + expected_global_shapes = { + k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() + } assert tensor_global_shapes == expected_global_shapes # Test ShardedTensor keys @@ -114,5 +118,5 @@ def get_tensor_shapes_for_tp(transformer_config, tp_size): '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), '0.post_self_attn_layernorm.weight': (hs,), - '0.post_self_attn_layernorm.bias': (hs,) + '0.post_self_attn_layernorm.bias': (hs,), } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py index ead8ec8b744e..0595cf43b599 100644 --- a/tests/utils/test_parallel_utils.py +++ b/tests/utils/test_parallel_utils.py @@ -1,11 +1,13 @@ import os -import torch + import megatron.core.parallel_state as ps +import torch + class Utils: - world_size = 1 #one gpu for unit test - os.environ['LOCAL_RANK']='0' + world_size = 1 # one gpu for unit test + os.environ['LOCAL_RANK'] = '0' rank = int(os.environ['LOCAL_RANK']) @staticmethod @@ -16,16 +18,28 @@ def initialize_distributed(): master_ip = os.getenv('MASTER_ADDR', 'localhost') master_port = os.getenv('MASTER_PORT', '6000') init_method += master_ip + ':' + master_port - torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) - + torch.distributed.init_process_group( + backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method + ) + @staticmethod def destroy_model_parallel(): ps.destroy_model_parallel() torch.distributed.barrier() @staticmethod - def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + ): ps.destroy_model_parallel() if not torch.distributed.is_initialized(): Utils.initialize_distributed() - ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + ) From 5ad525c458025741ff3150aebf9172516922abef Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 10:21:23 -0800 Subject: [PATCH 58/69] fixes for code scan --- tests/collections/nlp/test_falcon_model.py | 2 -- tests/collections/nlp/test_falcon_transformer_layer.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py index a36b64e82271..860434ac772b 100644 --- a/tests/collections/nlp/test_falcon_model.py +++ b/tests/collections/nlp/test_falcon_model.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - import pytest import torch from omegaconf import DictConfig diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 80765edaa454..2613e5535a5b 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest import torch from megatron.core import parallel_state @@ -85,7 +83,6 @@ def test_sharded_state_dict(self, tp_pp): # Test all local shapes tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} tp_size = parallel_state.get_tensor_model_parallel_world_size() - ans = get_tensor_shapes_for_tp(transformer_config, tp_size) assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) # Test all global shapes. Prepend num layers in front of expected shapes From 9c4960f0bb64c9b52fd96f99316e88d37844b5ca Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 13:35:38 -0800 Subject: [PATCH 59/69] remove mcore dependent tests --- .../nlp/test_falcon_transformer_layer.py | 119 ------------------ tests/utils/test_parallel_utils.py | 45 ------- 2 files changed, 164 deletions(-) delete mode 100644 tests/collections/nlp/test_falcon_transformer_layer.py delete mode 100644 tests/utils/test_parallel_utils.py diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py deleted file mode 100644 index 2613e5535a5b..000000000000 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.transformer.transformer_config import TransformerConfig - -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec -from tests.utils.test_parallel_utils import Utils - - -class TestParallelFalconTransformerLayer: - def setup_method(self, method): - Utils.initialize_model_parallel(1, 1) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True - ) - self.parallel_falcon_transformer_layer = FalconTransformerLayer( - transformer_config, get_falcon_layer_spec().submodules - ) - - def teardown_method(self, method): - Utils.destroy_model_parallel() - - def test_constructor(self): - parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer - assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) - assert parallel_falcon_transformer_layer.layer_number == 1 - - num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) - assert num_weights == 1884 - - def test_gpu_forward(self): - parallel_transformer_layer = self.parallel_falcon_transformer_layer - config: TransformerConfig = parallel_transformer_layer.config - sequence_length = 32 - micro_batch_size = 2 - parallel_transformer_layer.cuda() - - # [sequence length, batch size, hidden size] - hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) - hidden_states = hidden_states.cuda() - - attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() - hidden_states, context = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) - assert hidden_states.shape[0] == sequence_length - assert hidden_states.shape[1] == micro_batch_size - assert hidden_states.shape[2] == config.hidden_size - - @pytest.mark.parametrize('tp_pp', [(1, 1)]) - def test_sharded_state_dict(self, tp_pp): - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(*tp_pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True - ) - parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) - - sharded_state_dict = parallel_transformer_layer.sharded_state_dict() - - extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} - sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} - assert all(isinstance(t, ShardedObject) for t in extra_states.values()) - assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) - - # Test all local shapes - tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} - tp_size = parallel_state.get_tensor_model_parallel_world_size() - assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) - - # Test all global shapes. 
Prepend num layers in front of expected shapes - tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} - expected_global_shapes = { - k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() - } - assert tensor_global_shapes == expected_global_shapes - - # Test ShardedTensor keys - for state_dict_key, sh_ten in sharded_tensors.items(): - assert state_dict_key == f'0.{sh_ten.key}' - - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(1, 1) - - -def get_tensor_shapes_for_tp(transformer_config, tp_size): - hs = transformer_config.hidden_size - ffn_hs = transformer_config.ffn_hidden_size - return { - '0.input_layernorm.weight': (hs,), - '0.input_layernorm.bias': (hs,), - '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), - '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), - '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), - '0.mlp.linear_fc2.bias': (hs,), - '0.self_attention.linear_proj.weight': (hs, hs // tp_size), - '0.self_attention.linear_proj.bias': (hs,), - '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), - '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), - '0.post_self_attn_layernorm.weight': (hs,), - '0.post_self_attn_layernorm.bias': (hs,), - } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py deleted file mode 100644 index 0595cf43b599..000000000000 --- a/tests/utils/test_parallel_utils.py +++ /dev/null @@ -1,45 +0,0 @@ -import os - -import megatron.core.parallel_state as ps -import torch - - -class Utils: - - world_size = 1 # one gpu for unit test - os.environ['LOCAL_RANK'] = '0' - rank = int(os.environ['LOCAL_RANK']) - - @staticmethod - def initialize_distributed(): - print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') - torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) - init_method = 'tcp://' - master_ip = os.getenv('MASTER_ADDR', 'localhost') - master_port = os.getenv('MASTER_PORT', '6000') - init_method += master_ip + ':' + master_port - torch.distributed.init_process_group( - backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method - ) - - @staticmethod - def destroy_model_parallel(): - ps.destroy_model_parallel() - torch.distributed.barrier() - - @staticmethod - def initialize_model_parallel( - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - virtual_pipeline_model_parallel_size=None, - pipeline_model_parallel_split_rank=None, - ): - ps.destroy_model_parallel() - if not torch.distributed.is_initialized(): - Utils.initialize_distributed() - ps.initialize_model_parallel( - tensor_model_parallel_size, - pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size, - pipeline_model_parallel_split_rank, - ) From fb048066e9688b945c2b6ecbd715687f5b79c012 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 14:25:40 -0800 Subject: [PATCH 60/69] Revert "remove mcore dependent tests" This reverts commit 9c4960f0bb64c9b52fd96f99316e88d37844b5ca. 
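For context on the revert: the helper being restored in tests/utils/test_parallel_utils.py wraps the usual single-process megatron-core setup that the FalconTransformerLayer tests depend on. The sketch below condenses that pattern; it assumes megatron-core is installed and one GPU is visible, and the function names are illustrative rather than the restored file's API.

```python
# Minimal single-GPU setup/teardown pattern used by megatron-core unit tests
# (illustrative sketch; the real helper lives in tests/utils/test_parallel_utils.py).
import os

import torch
import megatron.core.parallel_state as ps


def init_single_gpu_model_parallel(tp: int = 1, pp: int = 1) -> None:
    """Initialize torch.distributed with one rank, then megatron model parallel."""
    if not torch.distributed.is_initialized():
        rank = int(os.getenv("LOCAL_RANK", "0"))
        torch.cuda.set_device(rank % torch.cuda.device_count())
        init_method = f"tcp://{os.getenv('MASTER_ADDR', 'localhost')}:{os.getenv('MASTER_PORT', '6000')}"
        torch.distributed.init_process_group(
            backend="nccl", world_size=1, rank=rank, init_method=init_method
        )
    ps.destroy_model_parallel()
    ps.initialize_model_parallel(tp, pp)


def teardown_model_parallel() -> None:
    ps.destroy_model_parallel()
    torch.distributed.barrier()
```

Tests then call init_single_gpu_model_parallel() in setup_method and teardown_model_parallel() in teardown_method, which is essentially what the restored Utils class does through initialize_model_parallel and destroy_model_parallel.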
--- .../nlp/test_falcon_transformer_layer.py | 119 ++++++++++++++++++ tests/utils/test_parallel_utils.py | 45 +++++++ 2 files changed, 164 insertions(+) create mode 100644 tests/collections/nlp/test_falcon_transformer_layer.py create mode 100644 tests/utils/test_parallel_utils.py diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py new file mode 100644 index 000000000000..2613e5535a5b --- /dev/null +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig + +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec +from tests.utils.test_parallel_utils import Utils + + +class TestParallelFalconTransformerLayer: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.parallel_falcon_transformer_layer = FalconTransformerLayer( + transformer_config, get_falcon_layer_spec().submodules + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer + assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) + assert parallel_falcon_transformer_layer.layer_number == 1 + + num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) + assert num_weights == 1884 + + def test_gpu_forward(self): + parallel_transformer_layer = self.parallel_falcon_transformer_layer + config: TransformerConfig = parallel_transformer_layer.config + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_layer.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + hidden_states, context = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + @pytest.mark.parametrize('tp_pp', [(1, 1)]) + def test_sharded_state_dict(self, tp_pp): + Utils.destroy_model_parallel() + 
Utils.initialize_model_parallel(*tp_pp) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True + ) + parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) + + sharded_state_dict = parallel_transformer_layer.sharded_state_dict() + + extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} + sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} + assert all(isinstance(t, ShardedObject) for t in extra_states.values()) + assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) + + # Test all local shapes + tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} + tp_size = parallel_state.get_tensor_model_parallel_world_size() + assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) + + # Test all global shapes. Prepend num layers in front of expected shapes + tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} + expected_global_shapes = { + k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() + } + assert tensor_global_shapes == expected_global_shapes + + # Test ShardedTensor keys + for state_dict_key, sh_ten in sharded_tensors.items(): + assert state_dict_key == f'0.{sh_ten.key}' + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + +def get_tensor_shapes_for_tp(transformer_config, tp_size): + hs = transformer_config.hidden_size + ffn_hs = transformer_config.ffn_hidden_size + return { + '0.input_layernorm.weight': (hs,), + '0.input_layernorm.bias': (hs,), + '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), + '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), + '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), + '0.mlp.linear_fc2.bias': (hs,), + '0.self_attention.linear_proj.weight': (hs, hs // tp_size), + '0.self_attention.linear_proj.bias': (hs,), + '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), + '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), + '0.post_self_attn_layernorm.weight': (hs,), + '0.post_self_attn_layernorm.bias': (hs,), + } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py new file mode 100644 index 000000000000..0595cf43b599 --- /dev/null +++ b/tests/utils/test_parallel_utils.py @@ -0,0 +1,45 @@ +import os + +import megatron.core.parallel_state as ps +import torch + + +class Utils: + + world_size = 1 # one gpu for unit test + os.environ['LOCAL_RANK'] = '0' + rank = int(os.environ['LOCAL_RANK']) + + @staticmethod + def initialize_distributed(): + print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') + torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method + ) + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, 
+ ): + ps.destroy_model_parallel() + if not torch.distributed.is_initialized(): + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + ) From e54fdad6643e2acff0f87a4e20db7460c0be44d0 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 18:07:40 -0800 Subject: [PATCH 61/69] add import guards --- .../megatron/falcon/falcon_decoder_layer.py | 25 +++++++++++---- .../megatron/falcon/falcon_spec.py | 32 ++++++++++++------- .../nlp/test_falcon_transformer_layer.py | 16 +++++++--- tests/utils/test_parallel_utils.py | 10 ++++-- 4 files changed, 58 insertions(+), 25 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index cbd3430adba8..a75a6f5e4645 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -12,13 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.spec_utils import build_module -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules -from megatron.core.utils import make_viewless_tensor +try: + from megatron.core import parallel_state + from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.spec_utils import build_module + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + from megatron.core.utils import make_viewless_tensor + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False """ We use the following notation throughout this file: h: hidden size @@ -51,6 +58,10 @@ def __init__( layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): + if not HAVE_MEGATRON_CORE: + raise ImportError( + "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + ) super().__init__(config=config, submodules=submodules, layer_number=layer_number) if hasattr(self.config, 'new_decoder_architecture'): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 28c96bb12af3..88a4b5a7bb7c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -12,19 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.custom_layers.transformer_engine import ( - TEColumnParallelLinear, - TEDotProductAttention, - TENorm, - TERowParallelLinear, -) -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.spec_utils import ModuleSpec +try: + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, + ) + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.spec_utils import ModuleSpec + + from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False -from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 2613e5535a5b..d17597f7fd08 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -14,10 +14,18 @@ import pytest import torch -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.transformer.transformer_config import TransformerConfig + +try: + from megatron.core import parallel_state + from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor + from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + from megatron.core.transformer.transformer_config import TransformerConfig + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py index 0595cf43b599..c0af612280f5 100644 --- a/tests/utils/test_parallel_utils.py +++ b/tests/utils/test_parallel_utils.py @@ -1,8 +1,14 @@ import os - -import megatron.core.parallel_state as ps import torch +try: + import megatron.core.parallel_state as ps + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False class Utils: From 7cd8cfb12a4f1861ea26de30ecd693c8df82ec76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Dec 2023 02:09:07 +0000 Subject: [PATCH 62/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/utils/test_parallel_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py index c0af612280f5..eadc467fbce1 100644 --- a/tests/utils/test_parallel_utils.py +++ b/tests/utils/test_parallel_utils.py @@ -10,6 +10,7 @@ HAVE_MEGATRON_CORE = False + class Utils: world_size = 1 # one gpu for unit test From beada8c6014489cc598f96c55d743cd3e0b3c325 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 00:29:54 -0800 Subject: [PATCH 63/69] add import guards cont --- .../megatron/falcon/falcon_decoder_layer.py | 3 +++ .../language_modeling/megatron/falcon/falcon_spec.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index a75a6f5e4645..73b4a70797ab 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -27,6 +27,9 @@ HAVE_MEGATRON_CORE = False + class TransformerLayer: + pass + """ We use the following notation throughout this file: h: hidden size n: number of attention heads diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 88a4b5a7bb7c..5022b49b07e2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -33,10 +33,17 @@ HAVE_MEGATRON_CORE = False + from typing import Any + ModuleSpec = Any + from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE def get_falcon_layer_spec() -> ModuleSpec: + if not HAVE_MEGATRON_CORE: + raise ImportError( + "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
+ ) falcon_submodules = TransformerLayerSubmodules( input_layernorm=TENorm, self_attention=ModuleSpec( From 5d76cf3a3cb51a25c995f3140ee4fdb8cb49fbdc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Dec 2023 08:31:25 +0000 Subject: [PATCH 64/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/falcon/falcon_decoder_layer.py | 1 + .../nlp/models/language_modeling/megatron/falcon/falcon_spec.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 73b4a70797ab..92db42a88fc1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -30,6 +30,7 @@ class TransformerLayer: pass + """ We use the following notation throughout this file: h: hidden size n: number of attention heads diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 5022b49b07e2..b6ec930c2964 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -34,6 +34,7 @@ HAVE_MEGATRON_CORE = False from typing import Any + ModuleSpec = Any from .falcon_decoder_layer import FalconTransformerLayer From 27b7694361d971306b99a8fd88950e2756458a26 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 17:36:10 -0800 Subject: [PATCH 65/69] fixes for ci import tests and unit tests --- .../megatron/falcon/falcon_decoder_layer.py | 12 ++-- .../megatron/falcon/falcon_spec.py | 6 +- tests/collections/nlp/test_falcon_model.py | 4 +- .../nlp/test_falcon_transformer_layer.py | 68 ++----------------- tests/utils/test_parallel_utils.py | 52 -------------- 5 files changed, 17 insertions(+), 125 deletions(-) delete mode 100644 tests/utils/test_parallel_utils.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 92db42a88fc1..4c5e88b59680 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults + try: from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor @@ -27,9 +29,10 @@ HAVE_MEGATRON_CORE = False - class TransformerLayer: - pass - + TransformerLayer = ApexGuardDefaults + TransformerConfig = ApexGuardDefaults + TransformerLayerSubmodules = ApexGuardDefaults + AttnMaskType = ApexGuardDefaults() """ We use the following notation throughout this file: h: hidden size @@ -46,7 +49,6 @@ class TransformerLayer: hyperparameters: transformer hyperparameters """ - class FalconTransformerLayer(TransformerLayer): """A single transformer layer. 
@@ -57,7 +59,7 @@ class FalconTransformerLayer(TransformerLayer): def __init__( self, - config: TransformerConfig, # should come from FalconTransformerConfig class + config: TransformerConfig, submodules: TransformerLayerSubmodules, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index b6ec930c2964..924e5f4321e6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults + try: from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules @@ -33,9 +35,7 @@ HAVE_MEGATRON_CORE = False - from typing import Any - - ModuleSpec = Any + ModuleSpec = ApexGuardDefaults from .falcon_decoder_layer import FalconTransformerLayer diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py index 860434ac772b..23430ad36300 100644 --- a/tests/collections/nlp/test_falcon_model.py +++ b/tests/collections/nlp/test_falcon_model.py @@ -32,8 +32,8 @@ def model_cfg(test_data_dir): model_cfg = { 'mcore_gpt': True, - 'micro_batch_size': 1, - 'global_batch_size': 1, + 'micro_batch_size': 4, + 'global_batch_size': 8, 'rampup_batch_size': None, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index d17597f7fd08..609a56e14596 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -16,8 +16,7 @@ import torch try: - from megatron.core import parallel_state - from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor + from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.transformer_config import TransformerConfig @@ -29,12 +28,11 @@ from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec -from tests.utils.test_parallel_utils import Utils - +@pytest.mark.run_only_on('GPU') class TestParallelFalconTransformerLayer: + def setup_method(self, method): - Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) transformer_config = TransformerConfig( num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True @@ -43,9 +41,7 @@ def setup_method(self, method): transformer_config, get_falcon_layer_spec().submodules ) - def teardown_method(self, method): - Utils.destroy_model_parallel() - + @pytest.mark.unit def test_constructor(self): parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) @@ -54,6 +50,7 @@ def test_constructor(self): num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) assert num_weights == 1884 + @pytest.mark.unit def test_gpu_forward(self): 
parallel_transformer_layer = self.parallel_falcon_transformer_layer config: TransformerConfig = parallel_transformer_layer.config @@ -70,58 +67,3 @@ def test_gpu_forward(self): assert hidden_states.shape[0] == sequence_length assert hidden_states.shape[1] == micro_batch_size assert hidden_states.shape[2] == config.hidden_size - - @pytest.mark.parametrize('tp_pp', [(1, 1)]) - def test_sharded_state_dict(self, tp_pp): - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(*tp_pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True - ) - parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) - - sharded_state_dict = parallel_transformer_layer.sharded_state_dict() - - extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} - sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} - assert all(isinstance(t, ShardedObject) for t in extra_states.values()) - assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) - - # Test all local shapes - tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} - tp_size = parallel_state.get_tensor_model_parallel_world_size() - assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) - - # Test all global shapes. Prepend num layers in front of expected shapes - tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} - expected_global_shapes = { - k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() - } - assert tensor_global_shapes == expected_global_shapes - - # Test ShardedTensor keys - for state_dict_key, sh_ten in sharded_tensors.items(): - assert state_dict_key == f'0.{sh_ten.key}' - - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(1, 1) - - -def get_tensor_shapes_for_tp(transformer_config, tp_size): - hs = transformer_config.hidden_size - ffn_hs = transformer_config.ffn_hidden_size - return { - '0.input_layernorm.weight': (hs,), - '0.input_layernorm.bias': (hs,), - '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), - '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), - '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), - '0.mlp.linear_fc2.bias': (hs,), - '0.self_attention.linear_proj.weight': (hs, hs // tp_size), - '0.self_attention.linear_proj.bias': (hs,), - '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), - '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), - '0.post_self_attn_layernorm.weight': (hs,), - '0.post_self_attn_layernorm.bias': (hs,), - } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py deleted file mode 100644 index eadc467fbce1..000000000000 --- a/tests/utils/test_parallel_utils.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -import torch - -try: - import megatron.core.parallel_state as ps - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -class Utils: - - world_size = 1 # one gpu for unit test - os.environ['LOCAL_RANK'] = '0' - rank = int(os.environ['LOCAL_RANK']) - - @staticmethod - def initialize_distributed(): - print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') - torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) - init_method = 'tcp://' - master_ip = 
os.getenv('MASTER_ADDR', 'localhost') - master_port = os.getenv('MASTER_PORT', '6000') - init_method += master_ip + ':' + master_port - torch.distributed.init_process_group( - backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method - ) - - @staticmethod - def destroy_model_parallel(): - ps.destroy_model_parallel() - torch.distributed.barrier() - - @staticmethod - def initialize_model_parallel( - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - virtual_pipeline_model_parallel_size=None, - pipeline_model_parallel_split_rank=None, - ): - ps.destroy_model_parallel() - if not torch.distributed.is_initialized(): - Utils.initialize_distributed() - ps.initialize_model_parallel( - tensor_model_parallel_size, - pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size, - pipeline_model_parallel_split_rank, - ) From e7476e8e7d78237bfec68fd9ee077f215766eb3a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Dec 2023 01:38:38 +0000 Subject: [PATCH 66/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/falcon/falcon_decoder_layer.py | 1 + tests/collections/nlp/test_falcon_transformer_layer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 4c5e88b59680..67c732c6aee2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -49,6 +49,7 @@ hyperparameters: transformer hyperparameters """ + class FalconTransformerLayer(TransformerLayer): """A single transformer layer. diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 609a56e14596..3edb541e8e33 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -29,9 +29,9 @@ from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec + @pytest.mark.run_only_on('GPU') class TestParallelFalconTransformerLayer: - def setup_method(self, method): model_parallel_cuda_manual_seed(123) transformer_config = TransformerConfig( From 90285550a47ae3cd2f8574f5840227e023f42582 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 22:35:28 -0800 Subject: [PATCH 67/69] fixes for codeql --- .../megatron/falcon/falcon_decoder_layer.py | 55 ++++++++++--------- .../megatron/falcon/falcon_spec.py | 41 +++++++------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 67c732c6aee2..f02f183adea3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -69,35 +69,36 @@ def __init__( raise ImportError( "megatron-core was not found. 
Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) - super().__init__(config=config, submodules=submodules, layer_number=layer_number) - - if hasattr(self.config, 'new_decoder_architecture'): - self.new_decoder_architecture = self.config.new_decoder_architecture - else: - self.new_decoder_architecture = None - if hasattr(self.config, 'parallel_attention'): - self.parallel_attention = self.config.parallel_attention else: - self.parallel_attention = None + super().__init__(config=config, submodules=submodules, layer_number=layer_number) - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - self.post_self_attn_layernorm = build_module( - submodules.post_self_attn_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - if self.new_decoder_architecture: - self.pre_mlp_layernorm = build_module( - submodules.pre_mlp_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - else: - self.pre_mlp_layernorm = None + if hasattr(self.config, 'new_decoder_architecture'): + self.new_decoder_architecture = self.config.new_decoder_architecture + else: + self.new_decoder_architecture = None + if hasattr(self.config, 'parallel_attention'): + self.parallel_attention = self.config.parallel_attention + else: + self.parallel_attention = None + + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + if self.new_decoder_architecture: + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.pre_mlp_layernorm = None def forward( self, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 924e5f4321e6..ab5622547782 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -45,24 +45,25 @@ def get_falcon_layer_spec() -> ModuleSpec: raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
) - falcon_submodules = TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, + else: + falcon_submodules = TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ) - # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. - falcon_submodules.post_self_attn_layernorm = TENorm - return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + ) + # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. + falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) From 0531cff47ae948066b7ffb687b3683c205116f66 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Dec 2023 06:38:41 +0000 Subject: [PATCH 68/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../models/language_modeling/megatron/falcon/falcon_spec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index ab5622547782..6efe6d4e23c7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -60,7 +60,8 @@ def get_falcon_layer_spec() -> ModuleSpec: self_attn_bda=get_bias_dropout_add, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + module=MLP, + submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, ) From 5f866da2f93dce1c97ae7a3682cb9394bed7c49e Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 22:52:31 -0800 Subject: [PATCH 69/69] Revert "fixes for codeql" This reverts commit 90285550a47ae3cd2f8574f5840227e023f42582. 
--- .../megatron/falcon/falcon_decoder_layer.py | 55 +++++++++---------- .../megatron/falcon/falcon_spec.py | 42 +++++++------- 2 files changed, 47 insertions(+), 50 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index f02f183adea3..67c732c6aee2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -69,36 +69,35 @@ def __init__( raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) - else: - super().__init__(config=config, submodules=submodules, layer_number=layer_number) + super().__init__(config=config, submodules=submodules, layer_number=layer_number) - if hasattr(self.config, 'new_decoder_architecture'): - self.new_decoder_architecture = self.config.new_decoder_architecture - else: - self.new_decoder_architecture = None - if hasattr(self.config, 'parallel_attention'): - self.parallel_attention = self.config.parallel_attention - else: - self.parallel_attention = None + if hasattr(self.config, 'new_decoder_architecture'): + self.new_decoder_architecture = self.config.new_decoder_architecture + else: + self.new_decoder_architecture = None + if hasattr(self.config, 'parallel_attention'): + self.parallel_attention = self.config.parallel_attention + else: + self.parallel_attention = None - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - self.post_self_attn_layernorm = build_module( - submodules.post_self_attn_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - if self.new_decoder_architecture: - self.pre_mlp_layernorm = build_module( - submodules.pre_mlp_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - else: - self.pre_mlp_layernorm = None + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + if self.new_decoder_architecture: + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.pre_mlp_layernorm = None def forward( self, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 6efe6d4e23c7..924e5f4321e6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -45,26 +45,24 @@ def get_falcon_layer_spec() -> ModuleSpec: raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
) - else: - falcon_submodules = TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - ), + falcon_submodules = TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ) - # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. - falcon_submodules.post_self_attn_layernorm = TENorm - return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + ) + # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. + falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules)
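
The net effect of PATCH 61-69 is one recurring import-guard pattern applied to the Falcon modules: wrap every `megatron.core` import in try/except, record the outcome in HAVE_MEGATRON_CORE, alias the missing symbols to a benign placeholder so the module itself still imports, and raise ImportError only when the guarded class or spec is actually constructed. The sketch below is a minimal, self-contained illustration of that pattern, not code from the patches; `_GuardDefaults` and `GuardedFalconLayer` are hypothetical names standing in for NeMo's `ApexGuardDefaults` and `FalconTransformerLayer`.

```
# Minimal sketch of the import-guard pattern used in falcon_decoder_layer.py and
# falcon_spec.py. _GuardDefaults is a hypothetical stand-in for NeMo's
# ApexGuardDefaults; it exists only so the module imports cleanly without megatron-core.


class _GuardDefaults:
    """Placeholder whose instance attribute lookups return None when megatron-core is absent."""

    def __getattr__(self, item):
        return None


try:
    from megatron.core.transformer.transformer_layer import TransformerLayer

    HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

    HAVE_MEGATRON_CORE = False

    # Alias the missing base class so the class definition below still parses and imports.
    TransformerLayer = _GuardDefaults


class GuardedFalconLayer(TransformerLayer):
    """Defers the hard failure from import time to construction time."""

    def __init__(self, *args, **kwargs):
        if not HAVE_MEGATRON_CORE:
            raise ImportError(
                "megatron-core was not found. Please see the NeMo README for "
                "installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(*args, **kwargs)
```

Deferring the failure this way is what lets the CI import checks targeted by PATCH 65 import the Falcon modules in environments without megatron-core, while anyone who actually constructs the layer still gets an actionable error message.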