From d14e267300ef19bbc1475070ffdcce623491f95d Mon Sep 17 00:00:00 2001 From: Cathy <815244047@qq.com> Date: Fri, 10 May 2024 07:12:27 +0800 Subject: [PATCH] Add QWen1.5/Qwen2 support (#303) * add qwen2 support * update yaml * update format --------- Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> --- .../autoconfig/scripts/compare_throughput.py | 6 +- auto_configurator/autoconfig/search_config.py | 11 +- .../autoconfig/training_config.py | 39 ++- auto_configurator/autoconfig/utils.py | 11 +- auto_configurator/base_configs/qwen2_14b.yaml | 235 ++++++++++++++++ auto_configurator/base_configs/qwen2_4b.yaml | 235 ++++++++++++++++ auto_configurator/base_configs/qwen2_72b.yaml | 235 ++++++++++++++++ auto_configurator/base_configs/qwen2_7b.yaml | 235 ++++++++++++++++ .../conf/search_config/qwen2/14b.yaml | 23 ++ .../conf/search_config/qwen2/4b.yaml | 22 ++ .../conf/search_config/qwen2/72b.yaml | 23 ++ .../conf/search_config/qwen2/7b.yaml | 22 ++ .../conf/conversion/qwen2/convert_qwen2.yaml | 22 ++ .../conf/evaluation/qwen2/evaluate_all.yaml | 24 ++ .../conf/evaluation/qwen2/evaluate_boolq.yaml | 24 ++ .../conf/fine_tuning/qwen2/squad.yaml | 194 +++++++++++++ launcher_scripts/conf/peft/qwen2/sft.yaml | 263 ++++++++++++++++++ launcher_scripts/conf/peft/qwen2/squad.yaml | 242 ++++++++++++++++ .../conf/training/qwen2/qwen2_14b.yaml | 231 +++++++++++++++ .../conf/training/qwen2/qwen2_4b.yaml | 234 ++++++++++++++++ .../conf/training/qwen2/qwen2_72b.yaml | 234 ++++++++++++++++ .../conf/training/qwen2/qwen2_7b.yaml | 234 ++++++++++++++++ launcher_scripts/main.py | 3 + .../eval_harness/lm_eval/models/__init__.py | 2 + .../eval_harness/lm_eval/models/nemo_qwen2.py | 261 +++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 9 + 26 files changed, 3057 insertions(+), 17 deletions(-) create mode 100644 auto_configurator/base_configs/qwen2_14b.yaml create mode 100644 auto_configurator/base_configs/qwen2_4b.yaml create mode 100644 auto_configurator/base_configs/qwen2_72b.yaml create mode 100644 auto_configurator/base_configs/qwen2_7b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/14b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/4b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/72b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/7b.yaml create mode 100644 launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml create mode 100644 launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml create mode 100644 launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml create mode 100644 launcher_scripts/conf/fine_tuning/qwen2/squad.yaml create mode 100644 launcher_scripts/conf/peft/qwen2/sft.yaml create mode 100644 launcher_scripts/conf/peft/qwen2/squad.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_14b.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_4b.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_72b.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_7b.yaml create mode 100644 launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py diff --git a/auto_configurator/autoconfig/scripts/compare_throughput.py b/auto_configurator/autoconfig/scripts/compare_throughput.py index a1e9c96ab9..9771bc8045 100644 --- a/auto_configurator/autoconfig/scripts/compare_throughput.py +++ b/auto_configurator/autoconfig/scripts/compare_throughput.py @@ -78,12 +78,12 @@ def main(cfg): gbs = model_cfg.get("global_batch_size") 
enc_seq_len = ( model_cfg.get("encoder_seq_length") - if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm") + if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2") else model_cfg.get("seq_length") ) dec_seq_len = data_cfg.get("seq_length_dec") - if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm"): + if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"): hs = model_cfg.get("hidden_size") ffn_hs = None layers = model_cfg.get("num_layers") @@ -249,7 +249,7 @@ def calculate_tflops( Bert Formula: Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) """ - if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: # Model FLOPS calculation model_flops = ( ( diff --git a/auto_configurator/autoconfig/search_config.py b/auto_configurator/autoconfig/search_config.py index 1059be8c99..5bd83b964b 100644 --- a/auto_configurator/autoconfig/search_config.py +++ b/auto_configurator/autoconfig/search_config.py @@ -20,7 +20,16 @@ from autoconfig.inference_sweep import search_inference_config from autoconfig.training_config import search_training_config -SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert", "llama", "baichuan2", "chatglm"] +SUPPORTED_MODELS = [ + "gpt3", + "t5", + "mt5", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", +] def search_config( diff --git a/auto_configurator/autoconfig/training_config.py b/auto_configurator/autoconfig/training_config.py index 56919b981c..ac007794a4 100644 --- a/auto_configurator/autoconfig/training_config.py +++ b/auto_configurator/autoconfig/training_config.py @@ -81,17 +81,19 @@ def generate_grid_search_configs( # 2 * num_layers is needed because of encoder/decoder architecture. multiplier = ( - 1 if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] else 2 + 1 + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] + else 2 ) seq_length = base_cfg["model"]["data"]["seq_length"] num_layers = ( base_cfg["model"]["num_layers"] - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] else base_cfg["model"]["encoder"]["num_layers"] ) - if model_name in ["gpt3", "bert", "llama"]: + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"]: act_method = base_cfg["model"].get("activations_checkpoint_method", "None") else: act_method = base_cfg["model"]["encoder"].get( @@ -126,7 +128,14 @@ def generate_grid_search_configs( base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] ) gbs = base_cfg["model"]["global_batch_size"] - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + ]: att_heads = base_cfg["model"]["num_attention_heads"] num_layers = base_cfg["model"]["num_layers"] else: @@ -222,7 +231,8 @@ def _set_activations_checkpoint_params( max_layers_per_pipe = num_layers interval_layers_per_pipe = act_multiple if ( - model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] and pp > 2 + model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] + and pp > 2 ): # Interleaved pipeline scheduling. 
virtual_pipelines = ( num_layers // pp @@ -246,7 +256,14 @@ def _set_activations_checkpoint_params( 0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple ) - if pp > 1 and model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if pp > 1 and model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + ]: # Num micro batches with partial act ckpt num_micro_batches_partial_act_ckpt = list( range(min_micro_b, max_micro_b + 1, interval_micro_b) @@ -824,14 +841,18 @@ def _calculate_tp_pp_mbs_grid( gpu_memory_gb = train_cfg.get("gpu_memory_gb") multiplier = ( - 1 if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] else 2 + 1 + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] + else 2 + ) + init_pp = ( + [] if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"] else [1] ) - init_pp = [] if model_name in ["gpt3", "llama", "baichuan2", "chatglm"] else [1] valid_pp = init_pp + [ multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 ] # Only divisors of num_layers are possible. - if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: if gpu_memory_gb == 80: ( tp, diff --git a/auto_configurator/autoconfig/utils.py b/auto_configurator/autoconfig/utils.py index c53f71021e..c8d366e1f2 100644 --- a/auto_configurator/autoconfig/utils.py +++ b/auto_configurator/autoconfig/utils.py @@ -45,7 +45,7 @@ def _calculate_model_size( :rtype: float :raises NotImplementedError: if the model name is not valid. """ - if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: model_size = ( 12 * num_layers @@ -113,7 +113,7 @@ def calculate_model_size_params( :raises NotImplementedError: if the model name is not supported. """ ffn, kv = None, None # Only needed for some models. 
- if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: if model_size_in_b < 0.25: hs, att_h, lr = 768, 12, 6e-4 elif model_size_in_b < 0.5: @@ -395,7 +395,7 @@ def modify_cfg( """ new_cfg = copy.deepcopy(base_cfg) if act is not None: - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"]: new_cfg["model"]["activations_checkpoint_num_layers"] = act else: new_cfg["model"]["encoder"]["activations_checkpoint_num_layers"] = act // 2 @@ -407,6 +407,7 @@ def modify_cfg( "llama", "baichuan2", "chatglm", + "qwen2", ]: new_cfg["model"][ "num_micro_batches_with_partial_activation_checkpoints" @@ -418,6 +419,7 @@ def modify_cfg( "llama", "baichuan2", "chatglm", + "qwen2", ]: new_cfg["model"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe @@ -427,6 +429,7 @@ def modify_cfg( "llama", "baichuan2", "chatglm", + "qwen2", ]: new_cfg["model"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines @@ -434,7 +437,7 @@ def modify_cfg( new_cfg["model"]["pipeline_model_parallel_size"] = pp new_cfg["model"]["micro_batch_size"] = mbs - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"]: att_heads = new_cfg["model"]["num_attention_heads"] num_layers = new_cfg["model"]["num_layers"] else: diff --git a/auto_configurator/base_configs/qwen2_14b.yaml b/auto_configurator/base_configs/qwen2_14b.yaml new file mode 100644 index 0000000000..dd6beb9d8a --- /dev/null +++ b/auto_configurator/base_configs/qwen2_14b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_14b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + 
hidden_size: 5120 + ffn_hidden_size: 13696 + num_attention_heads: 40 + num_query_groups: 40 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-14B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/base_configs/qwen2_4b.yaml b/auto_configurator/base_configs/qwen2_4b.yaml new file mode 100644 index 0000000000..b2268b32e7 --- /dev/null +++ b/auto_configurator/base_configs/qwen2_4b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_4b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + 
create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + hidden_size: 2560 + ffn_hidden_size: 6912 + num_attention_heads: 20 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + num_query_groups: 20 + override_vocab_size: 151936 + rotary_base: 5000000.0 + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-4B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/base_configs/qwen2_72b.yaml b/auto_configurator/base_configs/qwen2_72b.yaml new file mode 100644 index 0000000000..0f01f0b423 --- /dev/null +++ b/auto_configurator/base_configs/qwen2_72b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_72b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + 
create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 8 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 24576 + num_attention_heads: 64 + num_query_groups: 64 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-72B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/base_configs/qwen2_7b.yaml b/auto_configurator/base_configs/qwen2_7b.yaml new file mode 100644 index 0000000000..cfb67e2934 --- /dev/null +++ b/auto_configurator/base_configs/qwen2_7b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + 
create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + num_query_groups: 32 + override_vocab_size: 151936 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-7B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/conf/search_config/qwen2/14b.yaml b/auto_configurator/conf/search_config/qwen2/14b.yaml new file mode 100644 index 0000000000..050ff98bf3 --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/14b.yaml @@ -0,0 +1,23 @@ +train_settings: + model_size_in_b: 14 # unit in billion parameters + num_nodes: 2 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 152064 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_14b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + diff --git a/auto_configurator/conf/search_config/qwen2/4b.yaml b/auto_configurator/conf/search_config/qwen2/4b.yaml new file mode 100644 index 0000000000..4945230665 --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/4b.yaml @@ -0,0 +1,22 @@ +train_settings: + model_size_in_b: 4 # unit in billion parameters + num_nodes: 1 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. + vocab_size: 151936 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_4b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] diff --git a/auto_configurator/conf/search_config/qwen2/72b.yaml b/auto_configurator/conf/search_config/qwen2/72b.yaml new file mode 100644 index 0000000000..4e5ff55bae --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/72b.yaml @@ -0,0 +1,23 @@ +train_settings: + model_size_in_b: 72 # unit in billion parameters + num_nodes: 8 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. 
+ max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. + vocab_size: 152064 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_72b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + diff --git a/auto_configurator/conf/search_config/qwen2/7b.yaml b/auto_configurator/conf/search_config/qwen2/7b.yaml new file mode 100644 index 0000000000..ac32a47292 --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/7b.yaml @@ -0,0 +1,22 @@ +train_settings: + model_size_in_b: 7 # unit in billion parameters + num_nodes: 1 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 151936 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_7b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] diff --git a/launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml b/launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml new file mode 100644 index 0000000000..ab670c4f50 --- /dev/null +++ b/launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml @@ -0,0 +1,22 @@ +run: + name: convert_${conversion.run.model_train_name} + nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node + time_limit: "1:00:00" + dependency: "singleton" + ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} + convert_name: convert_nemo + model_train_name: qwen2_7b + train_dir: ${base_results_dir}/${.model_train_name} + results_dir: ${.train_dir}/${.convert_name} + nemo_file_name: megatron_qwen2.nemo # name of nemo checkpoint; must be .nemo file + pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory + +model: + model_type: gpt # gpt or t5, use t5 for mt5 as well + checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_qwen2-*last.ckpt) + hparams_file: ${conversion.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model diff --git a/launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml b/launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml new file mode 100644 index 0000000000..4ba8dea87c --- /dev/null +++ b/launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml @@ -0,0 +1,24 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "02:00:00" + dependency: "singleton" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_all + model_train_name: qwen2_7b + train_dir: ${base_results_dir}/${.model_train_name} + tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-qwen2 + nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints + #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
 megatron_gpt-*last.ckpt)
+  #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  precision: bf16 # must match training precision - 32, 16 or bf16
+  eval_batch_size: 4
+  #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model
diff --git a/launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml b/launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml
new file mode 100644
index 0000000000..49ba25236c
--- /dev/null
+++ b/launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml
@@ -0,0 +1,24 @@
+run:
+  name: ${.eval_name}_${.model_train_name}
+  time_limit: "02:00:00"
+  dependency: "singleton"
+  nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node
+  ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}}
+  eval_name: eval_boolq
+  model_train_name: qwen2_7b
+  train_dir: ${base_results_dir}/${.model_train_name}
+  tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks
+  results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name}
+
+model:
+  model_type: nemo-qwen2
+  nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints
+  #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints
+  #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt)
+  #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  precision: bf16 # must match training precision - 32, 16 or bf16
+  eval_batch_size: 4
+  #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model
diff --git a/launcher_scripts/conf/fine_tuning/qwen2/squad.yaml b/launcher_scripts/conf/fine_tuning/qwen2/squad.yaml
new file mode 100644
index 0000000000..66df27b43c
--- /dev/null
+++ b/launcher_scripts/conf/fine_tuning/qwen2/squad.yaml
@@ -0,0 +1,194 @@
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: qwen2_7b
+  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
+  task_name: "squad" # Rename this name to be more clear
+  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}
+
+trainer:
+  devices: 8
+  accelerator: gpu
+  num_nodes: 1
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 13000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged
+  val_check_interval: 300 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g.
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${fine_tuning.run.results_dir}/results + exp_dir: null + name: megatron_qwen2_${fine_tuning.run.task_name} + create_wandb_logger: False + wandb_logger_kwargs: + project: nemo_qwen2_${fine_tuning.run.task_name} + name: ${fine_tuning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + save_top_k: 5 + mode: min + save_nemo_on_train_end: True + filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}} + save_best_model: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + global_batch_size: 32 + micro_batch_size: 4 + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_qwen2.nemo # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: True # not used right now + gradient_as_bucket_view: False + seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value + use_flash_attention: True # if not None, will match the base model's value + + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.1 + + # FSDP + fsdp: False # Enable training with torch FSDP. + fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'. + fsdp_grad_reduce_dtype: 'bf16' # Gradient reduction data type. + fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. + fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme. + + data: + chat: False # whether use chatbot data or not + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: True + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + test_ds: + file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
+ global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + lr: 1e-6 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + min_lr: 1e-8 + warmup_steps: 1000 + last_epoch: -1 + + diff --git a/launcher_scripts/conf/peft/qwen2/sft.yaml b/launcher_scripts/conf/peft/qwen2/sft.yaml new file mode 100644 index 0000000000..09cb2b3430 --- /dev/null +++ b/launcher_scripts/conf/peft/qwen2/sft.yaml @@ -0,0 +1,263 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: sft_${.model_train_name} + results_dir: ${base_results_dir}/${.name} + time_limit: 04:00:00 + dependency: singleton + convert_name: convert_nemo + model_train_name: qwen2 + convert_dir: ${base_results_dir}/${.model_train_name}/${.convert_name} + +trainer: + devices: 8 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${peft.run.results_dir}/results + exp_dir: null + name: ${peft.run.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.exp_manager.name}--{${.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${peft.model.tensor_model_parallel_size}, ${peft.model.pipeline_model_parallel_size}} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + create_tensorboard_logger: False + log_global_rank_0_only: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null + + encoder_seq_length: ${peft.model.data.train_ds.max_seq_length} + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ${peft.run.convert_dir}/results/megatron_qwen2.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + overlap_p2p_comm: False + ub_tp_comm_overlap: False + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + + megatron_amp_O2: True + mcore_gpt: True + transformer_engine: False + + get_attention_mask_from_fusion: True + apply_rope_fusion: True + bias_activation_fusion: True + bias_dropout_add_fusion: True + masked_softmax_fusion: True + gradient_accumulation_fusion: True + grad_div_ar_fusion: True + + fp8: False + fp8_e4m3: False + fp8_hybrid: True + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 512 + fp8_amax_compute_algo: max + fp8_wgrad: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
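+  # Note: sequence parallelism only takes effect when tensor_model_parallel_size > 1; with the default TP=1 it is a no-op.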
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + answer_only_loss: False + gradient_as_bucket_view: True + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: null # null (SFT, no PEFT), ptuning, lora + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ??? # Path to a list of JSONL files corresponding to the source data. 
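+      # '???' is OmegaConf's mandatory-value marker: the job fails fast unless this is overridden at launch time,
+      # e.g. peft.model.data.train_ds.file_names=[/path/to/train.jsonl] (illustrative path).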
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 2 + memmap_workers: 2 + pin_memory: True + packed_sequence: False # Set to true to load with GPTSFTPackedDataset + pad_to_max_length: False + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + validation_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: ${peft.model.data.train_ds.num_workers} + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: False + label_key: ${peft.model.data.train_ds.label_key} + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${peft.model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: False
+      num_workers: ${peft.model.data.train_ds.num_workers}
+      memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+      pin_memory: True
+      max_seq_length: ${peft.model.data.train_ds.max_seq_length}
+      min_seq_length: ${peft.model.data.train_ds.min_seq_length}
+      drop_last: False
+      label_key: ${peft.model.data.train_ds.label_key}
+      add_eos: ${peft.model.data.train_ds.add_eos}
+      add_sep: ${peft.model.data.train_ds.add_sep}
+      add_bos: ${peft.model.data.train_ds.add_bos}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: ${peft.model.data.train_ds.truncation_field} # Options: keys in prompt_template
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${peft.model.data.train_ds.prompt_template}
+      tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+      truncation_method: 'right' # Truncation from which position, Options: ['left', 'right']
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+
+  optim:
+    name: distributed_fused_adam
+    lr: 1e-4
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.98
+    bucket_cap_mb: 50
+    dtype: bf16 # fp32 | bf16
+    grad_sync_dtype: bf16 # fp32 | bf16
+    overlap_grad_sync: True
+    overlap_param_sync: True
+    contiguous_grad_buffer: True
+    contiguous_param_buffer: True
+
diff --git a/launcher_scripts/conf/peft/qwen2/squad.yaml b/launcher_scripts/conf/peft/qwen2/squad.yaml
new file mode 100644
index 0000000000..059fa78715
--- /dev/null
+++ b/launcher_scripts/conf/peft/qwen2/squad.yaml
@@ -0,0 +1,242 @@
+name: megatron_qwen2_peft_tuning-${peft.model.peft.peft_scheme}
+
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: qwen2_7b
+  convert_dir: ${base_results_dir}/${.model_train_name}/${.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/peft_${.name}
+
+trainer:
+  devices: 8
+  accelerator: gpu
+  num_nodes: 1
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 9999
+  max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged
+  val_check_interval: 200 # If an int n > 1, will run val every n training steps; if a float 0.0 - 1.0, will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
+  gradient_clip_val: 1.0
+
+exp_manager:
+  explicit_log_dir: ${peft.run.results_dir}/results
+  exp_dir: null
+  name: ${peft.name}
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: nemo_qwen2_${peft.run.task_name}
+    name: ${peft.run.name}
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: validation_${peft.model.data.validation_ds.metric.name}
+    save_top_k: 1
+    mode: min
+    save_nemo_on_train_end: True
+    filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
+    model_parallel_size: ${peft.model.tensor_model_parallel_size}
+    always_save_nemo: False
+    save_best_model: True
+  create_early_stopping_callback: True
+  early_stopping_callback_params:
+    monitor: "val_loss"
+    mode: "min"
+    min_delta: 0.001
+    patience: 10
+    verbose: True
+    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
+
+model:
+  seed: 1234
+  tensor_model_parallel_size: 1 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+
+  global_batch_size: 128
+  micro_batch_size: 4
+  restore_from_path: ${peft.run.convert_dir}/results/megatron_qwen2.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training.
+  sync_batch_comm: False
+  megatron_amp_O2: False
+  mcore_gpt: True
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+
+  ## Activation Checkpoint
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null # not used with 'selective'
+  activations_checkpoint_layers_per_pipeline: null
+  answer_only_loss: True
+  gradient_as_bucket_view: False
+
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+
+  # FSDP
+  fsdp: False # Enable training with torch FSDP.
+  fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'.
+  fsdp_grad_reduce_dtype: 'bf16' # Gradient reduction data type.
+  fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint.
+  fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme.
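+  # The peft block below selects the tuning method: peft_scheme "lora" trains only the LoRA adapter weights,
+  # while setting it to null falls back to full-parameter SFT (cf. conf/peft/qwen2/sft.yaml).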
+ + peft: + peft_scheme: "lora" # can be either adapter, ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: False + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. 
Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${peft.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml new file mode 100644 index 0000000000..70d02245b0 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml @@ -0,0 +1,231 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h5120_tp2_mbs1_seqlen4096 + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 32 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13696 + num_attention_heads: 40 + num_query_groups: 40 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-14B + model: null # /path/to/tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + 
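+  # With megatron_amp_O2 enabled, module weights run in bf16 while the optimizer keeps fp32 main parameters.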
grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: true + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml new file mode 100644 index 0000000000..6217eb0145 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml @@ -0,0 +1,234 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_4b + results_dir: ${base_results_dir}/${.name} + time_limit: 
"0-01:30:00" + dependency: "singleton" +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + hidden_size: 2560 + ffn_hidden_size: 6912 + num_attention_heads: 20 + num_query_groups: 20 + override_vocab_size: 151936 + rotary_base: 5000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-4B + model: null + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + + ## Transformer Engine + 
transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document + diff --git a/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml new file mode 100644 index 0000000000..0ba8267c42 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml @@ -0,0 +1,234 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h8192_tp4_mbs1_seqlen4096 + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_72b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 128 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + 
limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: 20 + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 24576 + num_attention_heads: 64 + num_query_groups: 64 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-72B + model: null # /path/to/tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: true + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + overlap_p2p_comm: true + batch_p2p_comm: false + gc_interval: 3 + optim: + name: distributed_fused_adam + lr: 0.00015 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: 
CosineAnnealing + warmup_steps: 2000 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document + diff --git a/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml new file mode 100644 index 0000000000..98bd353954 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml @@ -0,0 +1,234 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:30:00" + dependency: "singleton" +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 
'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + num_query_groups: 32 + override_vocab_size: 151936 + rotary_base: 1000000.0 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-7B + model: null # /path/to/tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document + diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 60f524c884..78f2a35780 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -90,6 +90,7 @@ "chatglm", "mistral", "mixtral", + "qwen2", ], NeMoEvaluation: [ "t5", @@ -111,6 +112,7 @@ "starcoder2", "peft_mistral", "peft_mixtral", + "peft_qwen2", ], DiffusionModelEvaluation: ["stable_diffusion", "imagen"], }, @@ -124,6 +126,7 @@ "falcon", "baichuan2", "chatglm", + "qwen2", ], MC4DataPreparation: ["mt5"], SteerLMDataPreparation: ["steerlm"], diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py index 46a264649e..cb1a4e681a 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py @@ -25,6 +25,7 @@ nemo_chatglm, nemo_mistral, nemo_mixtral, + nemo_qwen2, ) MODEL_REGISTRY = { @@ -37,6 +38,7 @@ "nemo-falcon": nemo_falcon.NeMo_FalconLM_TP_PP, "nemo-mistral": nemo_mistral.NeMo_MISTRAL_TP_PP, "nemo-mixtral": nemo_mixtral.NeMo_MIXTRAL_TP_PP, + "nemo-qwen2": nemo_qwen2.NeMo_QWEN2_TP_PP, "dummy": dummy.DummyLM, } diff --git 
a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py new file mode 100644 index 0000000000..7eb7b7c67b --- /dev/null +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py @@ -0,0 +1,261 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +import tqdm +from lm_eval import utils +from lm_eval.base import LM +from megatron.core import parallel_state +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import ( + MegatronGPTModel, +) +from nemo.collections.nlp.modules.common.megatron.megatron_init import ( + fake_initialize_model_parallel, +) +from nemo.collections.nlp.modules.common.text_generation_utils import ( + generate, + get_computeprob_response, +) +from nemo.collections.nlp.parts.nlp_overrides import ( + NLPDDPStrategy, + NLPSaveRestoreConnector, +) +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.model_utils import inject_model_parallel_rank +from omegaconf import OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate + +from .nemo_gpt3 import DDP_initialize, RequestDataset, setup_trainer_and_model + + +class NeMo_QWEN2_TP_PP(LM): + def __init__(self, args, truncate=False, batch_size=1): + super().__init__() + + # get nemo megatron + logging.info(f"**** Building Qwen2 model ...") + self.trainer, self.model = setup_trainer_and_model(args) + self.tokenizer = self.model.tokenizer + self.model.eval() + + self.max_length = self.model.cfg.get("max_position_embeddings") + + self.truncate = truncate + self.batch_size = batch_size + + # initialize DDP and move model to GPU + DDP_initialize(self.model) + self.model = self.model.cuda() + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config={}): + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(args, **args2) + + def loglikelihood(self, requests): + return self._loglikelihood(requests) + + """ + request: (context, continuation) + how this all works: + CTX CONT + inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + gpt2 \ \ + logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice + cont_toks 4 5 6 7 8 9 + when too long to fit in context, truncate from the left + """ + + def _loglikelihood(self, requests): + def pad_collate(batch, eos_id=2): + tokens = [item[0] for item in batch] + conti_lens = [item[1] for item in batch] + lens = [ + len(token) - 1 for token in tokens + ] # fake delete last token by reducing input len + max_len = max(lens) 
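+            # Round max_len up to the next multiple of 8; the extra positions are filled with eos_id below
+            # so every sample in the batch shares the same padded length expected by NeMo's generate API.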
+ extra_pad_len = 0 + if max_len % 8 != 0: + extra_pad_len = 8 - (max_len % 8) + max_len += extra_pad_len + # extra_pad_len = 2048 - max_len + # max_len += extra_pad_len + + tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id) + if extra_pad_len > 0: + extra_pad = torch.ones(extra_pad_len, len(batch)) * eos_id + extra_pad = extra_pad.type_as(tokens_pad) + tokens_pad = torch.vstack((tokens_pad, extra_pad)) + # Add padding to all samples to adapt nemo generate api + + new_batch = [] + for token, lenn, conti_len in zip(tokens_pad.T, lens, conti_lens): + # (token, lenn, tokens_to_generate, compute_logprobs) + new_batch.append((token, max_len, lenn, conti_len)) + + new_batch = default_collate(new_batch) + return new_batch + + def _collate(x): # used to reorder request and remove duplications + """ + the negative sign on len(toks) sorts descending - this has a few advantages: + - time estimates will always be over not underestimates, which is more useful for planning + - to know the size of a batch when going through the list, you know the first one is always the batch padded context length. + this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement + - any OOMs will happen right away rather than near the end + """ + toks = x[0] + x[1] + return -len(toks), tuple(toks) + + reord = utils.Reorderer(requests, _collate) + request_ds = RequestDataset( + reord.get_reordered(), self.model.tokenizer, self.max_length + ) + request_dl = DataLoader( + request_ds, + collate_fn=pad_collate, + batch_size=self.batch_size, + shuffle=False, + ) + + def logits_to_results(batch, response): + input_token_ids_batch, _, lens, conti_lens = batch + batch_size = len(lens) + assert ( + len(response["token_ids"]) == batch_size + ), "Response's length not equal to batch size." + + batch_res = [] + for index in range(batch_size): + inp_len = lens[index] + conti_len = conti_lens[index] + + inp_token_ids = input_token_ids_batch[index].tolist()[ + : inp_len + 1 + ] # recover fake deleted token + response_token_ids = response["token_ids"][index][:inp_len] + + assert ( + response_token_ids == inp_token_ids[:-1] + ), f"Mismatch in input tokens." 
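+                # Keep only the continuation's log-probs: the last conti_len positions of the returned
+                # full_logprob tensor; the preceding context positions are discarded.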
+                log_probs = response["full_logprob"][index][:inp_len]  # [inp_len, vocab] tensor
+                log_probs = log_probs[-conti_len:]
+
+                greedy_tokens = log_probs.argmax(dim=-1)
+                greedy_tokens = self.tokenizer.ids_to_tokens(
+                    greedy_tokens.cpu().numpy().tolist()
+                )
+
+                conti_token_ids = inp_token_ids[-conti_len:]
+                conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids)
+
+                max_equal = greedy_tokens == conti_tokens
+                log_probs = log_probs.cpu().to(torch.float32)
+                conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens))
+                conti_probs = torch.gather(
+                    log_probs, 1, conti_enc.unsqueeze(-1)
+                ).squeeze(-1)
+
+                batch_res.append(
+                    (
+                        float(conti_probs.sum()),
+                        bool(max_equal),
+                        greedy_tokens,
+                        conti_tokens,
+                    )
+                )
+            return batch_res
+
+        res = []
+        for batch in tqdm.tqdm(request_dl):
+            # inputs = (padded token ids, padded input lengths) as expected by generate()
+            inputs = (batch[0].cuda(), batch[1].cuda())
+            response = generate(
+                model=self.model,
+                inputs=inputs,
+                tokens_to_generate=1,
+                all_probs=True,
+                temperature=1.0,
+                add_BOS=False,
+                top_k=0,
+                top_p=0.9,
+                greedy=True,
+                repetition_penalty=1.0,
+                min_tokens_to_generate=0,
+                compute_logprob=True,
+                end_strings=[""],
+            )
+            response = get_computeprob_response(self.tokenizer, response, inputs)
+
+            if is_global_rank_zero():
+                res.extend(logits_to_results(batch, response))
+
+            del inputs, response
+
+        return reord.get_original(res) if self.can_access_output() else None
+
+    def loglikelihood_rolling(self, requests):
+        loglikelihoods = []
+        len_rolling_token_windows = [0]
+        all_rolling_token_windows = []
+
+        for (string,) in requests:
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tokenizer.text_to_ids(string),
+                        prefix_token=2,
+                        max_seq_len=self.max_length,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            len_rolling_token_windows.append(
+                len(rolling_token_windows) + len_rolling_token_windows[-1]
+            )
+            all_rolling_token_windows.extend(rolling_token_windows)
+
+        string_nll = self._loglikelihood(all_rolling_token_windows)
+        if self.can_access_output():
+            string_nll = [x[0] for x in string_nll]  # keep the log-prob, discard is_greedy
+            for i in range(len(len_rolling_token_windows) - 1):
+                loglikelihoods.append(
+                    sum(
+                        string_nll[
+                            len_rolling_token_windows[i] : len_rolling_token_windows[
+                                i + 1
+                            ]
+                        ]
+                    )
+                )
+
+        return loglikelihoods
+
+    def greedy_until(self, requests):
+        raise NotImplementedError
+
+    def can_access_output(self):
+        return is_global_rank_zero()
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index ba5e6a4af5..cae697fbcb 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -44,6 +44,7 @@
     "mixtral",
     "starcoder2",
     "chatglm",
+    "qwen2",
 ]
 __VISION_MODELS_LIST__ = ["vit"]
 __MULTIMODAL_MODELS_LIST__ = [
@@ -881,6 +882,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
             "mixtral": self._nemo_code_path
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
+            "qwen2": self._nemo_code_path
+            / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
         }
         return model_type_to_code_path[model_type]
@@ -966,6 +969,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
             / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
             "mixtral": self._nemo_code_path
             / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
+            "qwen2": self._nemo_code_path
+            / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
         }
return model_type_to_code_path[model_type] @@ -1113,6 +1118,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path: / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", "mixtral": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", + "qwen2": self._nemo_code_path + / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", } return model_type_to_code_path[model_type] @@ -1668,6 +1675,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path: / "examples/nlp/language_modeling/tuning/megatron_gpt_generate.py", "peft_mixtral": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_generate.py", + "peft_qwen2": self._nemo_code_path + / "examples/nlp/language_modeling/tuning/megatron_gpt_generate.py", "vit": self._nemo_code_path / "examples/vision/vision_transformer/megatron_vit_classification_evaluate.py", "clip": self._nemo_code_path