From 99d58473487e5468a96c4c6fc60e7214e67ce0fe Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 12 Aug 2025 09:27:15 -0700 Subject: [PATCH 01/12] add Megatron tests, improve some existing tests Signed-off-by: ashors1 --- ...o-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml} | 8 +- ...lama3.1-8b-instruct-4n8g-megatron.v2.yaml} | 8 +- ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 161 ++++++++++++ .../grpo-moonlight-16ba3b-4n8g-megatron.yaml | 169 ++++++++++++ ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 183 +++++++++++++ .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 156 +++++++++++ ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 135 ++++++++++ ...lama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 82 ++++++ ...> sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml} | 41 +-- ...l => sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml} | 31 ++- ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 127 +++++++++ ...aml => sft-llama3.1-8b-1n8g-megatron.yaml} | 41 +-- ... => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml} | 14 +- ...en2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml} | 13 +- examples/configs/sft_openmathinstruct2.yaml | 6 + tests/check_metrics.py | 29 ++- tests/functional/dpo_megatron.sh | 45 ++++ tests/functional/sft_megatron.sh | 45 ++++ ...a3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh | 3 +- ...dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh} | 3 +- ...-llama3.1-8b-instruct-4n8g-megatron.v2.sh} | 3 +- ...1-8b-instruct-4n8g-megatrontp2pp2-quick.sh | 3 +- ...o-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh | 3 +- .../llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh | 3 +- ...o-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh | 3 +- ...grpo-llama3.2-1b-instruct-1n8g-megatron.sh | 42 +++ .../grpo-moonlight-16ba3b-4n8g-megatron.sh | 41 +++ .../grpo-qwen2.5-7b-instruct-4n8g-megatron.sh | 41 +++ ...2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh | 3 +- .../llm/grpo-qwen3-30ba3b-8n8g-megatron.sh | 40 +++ ...-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh | 42 +++ ...llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh} | 13 +- .../llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh 
| 42 +++ ....sh => sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh} | 12 +- ... sft-llama3.1-8b-1n8g-megatron-seqpack.sh} | 8 +- .../llm/sft-llama3.1-8b-1n8g-megatron.sh | 39 +++ ...sh => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh} | 11 +- ...qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh} | 2 +- tests/test_suites/nightly.txt | 20 +- tests/test_suites/release.txt | 12 +- .../models/megatron/converters/test_common.py | 245 ++++++++++++++++++ 41 files changed, 1818 insertions(+), 110 deletions(-) rename examples/configs/recipes/llm/{dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml => dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml} (93%) rename examples/configs/recipes/llm/{dpo-llama3.1-8b-instruct-4n8g-megatron.yaml => dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml} (95%) create mode 100755 examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml create mode 100644 examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml create mode 100755 examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml create mode 100755 examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml rename examples/configs/recipes/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml => sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml} (66%) rename examples/configs/recipes/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml => sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml} (69%) create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml rename examples/configs/recipes/llm/{sft-llama3.1-8b-instruct-1n8g-megatron.yaml => sft-llama3.1-8b-1n8g-megatron.yaml} (74%) rename examples/configs/recipes/llm/{sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml} (87%) rename examples/configs/recipes/llm/{sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml 
=> sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml} (88%) create mode 100755 tests/functional/dpo_megatron.sh create mode 100755 tests/functional/sft_megatron.sh rename tests/test_suites/llm/{dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh => dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh} (91%) rename tests/test_suites/llm/{dpo-llama3.1-8b-instruct-4n8g-megatron.sh => dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh} (91%) create mode 100755 tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh create mode 100755 tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh create mode 100755 tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh create mode 100755 tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh rename tests/test_suites/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh => sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh} (81%) create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh rename tests/test_suites/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh => sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh} (83%) rename tests/test_suites/llm/{sft-llama3.1-8b-instruct-1n8g-megatron.sh => sft-llama3.1-8b-1n8g-megatron-seqpack.sh} (87%) create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh rename tests/test_suites/llm/{sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh} (82%) rename tests/test_suites/llm/{sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh => sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh} (96%) create mode 100755 tests/unit/models/megatron/converters/test_common.py diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml similarity index 93% rename from examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml rename to 
examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml index 9cd7573ccf..01395f5247 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml @@ -20,7 +20,7 @@ checkpointing: metric_name: "val_loss" higher_is_better: false keep_top_k: 3 - save_period: 10000 + save_period: 50 checkpoint_must_save_by: null policy: @@ -29,14 +29,14 @@ policy: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 - max_total_sequence_length: 2048 + max_total_sequence_length: 8192 precision: "bfloat16" dtensor_cfg: enabled: true cpu_offload: False sequence_parallel: false activation_checkpointing: false - tensor_parallel_size: 1 + tensor_parallel_size: 4 context_parallel_size: 1 custom_parallel_plan: null @@ -85,7 +85,7 @@ logger: num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 + name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4 tensorboard: {} gpu_monitoring: collection_interval: 10 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml similarity index 95% rename from examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml rename to examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 55b473b4b6..49e8a78422 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -20,7 +20,7 @@ checkpointing: metric_name: "val_loss" higher_is_better: false keep_top_k: 3 - save_period: 10000 + save_period: 50 checkpoint_must_save_by: null policy: @@ -29,7 +29,7 @@ policy: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 - max_total_sequence_length: 
2048 + max_total_sequence_length: 8192 precision: "bfloat16" dtensor_cfg: enabled: false @@ -49,7 +49,7 @@ policy: enabled: true empty_unused_memory_level: 1 activation_checkpointing: false - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 4 expert_tensor_parallel_size: 1 expert_model_parallel_size: 1 pipeline_model_parallel_size: 1 @@ -118,7 +118,7 @@ logger: num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 + name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron tensorboard: {} gpu_monitoring: collection_interval: 10 diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml new file mode 100755 index 0000000000..55de4c59f4 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -0,0 +1,161 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 + seed: 42 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + ratio_clip_c: null + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + token_level_loss: true +checkpointing: + enabled: false + checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 100 + checkpoint_must_save_by: null +policy: + model_name: meta-llama/Llama-3.2-1B-Instruct + tokenizer: + name: meta-llama/Llama-3.2-1B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + 
activation_checkpointing_enabled: false + optimizer: null + megatron_cfg: + enabled: true + empty_unused_memory_level: 0 + activation_checkpointing: false + tensor_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 5.0e-6 + min_lr: 5.0e-7 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 5.0e-7 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + dtensor_cfg: + enabled: false + dynamic_batching: + enabled: False + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: 
"modified_first_fit_decreasing" + sequence_length_round: 64 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + async_engine: false + precision: ${policy.precision} + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + enforce_eager: False + colocated: + enabled: true + resources: + gpus_per_node: null + num_nodes: null +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 + shuffle: true +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-llama3.2-1b-instruct-1n8g-megatron + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml new file mode 100644 index 0000000000..42b365f351 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml @@ -0,0 +1,169 @@ +# GRPO Algorithm Configuration +defaults: "../../grpo_math_1B.yaml" + +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_num_steps: 1000000 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: -1 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 + +loss_fn: + reference_policy_kl_penalty: 0.04 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + # (default off) loss formulation improvements (docs/guides/grpo.md#loss) + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + 
token_level_loss: true + ratio_clip_c: null + +checkpointing: + enabled: false + checkpoint_dir: "results/grpo_megatron" + metric_name: "val_reward" + higher_is_better: true + keep_top_k: 3 + save_period: 10000 + +policy: + model_name: "moonshotai/Moonlight-16B-A3B-Instruct" + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 64 # Only used when generating using megatron backend + logprob_batch_size: 1 + max_total_sequence_length: 8192 + precision: "bfloat16" + + dtensor_cfg: + enabled: false + + # dynamic_batching improves performance by ensuring logprob and training microbatches + # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length + # responses are sorted by sequence length and bucketed into microbatches with a total + # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the + # training and logprob stages respectively. 
+ dynamic_batching: + enabled: False + + sequence_packing: + enabled: False # coming soon + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_ffd" + sequence_length_round: 64 + + max_grad_norm: 1.0 + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + + optimizer: null # remove default FSDP optimizer + + megatron_cfg: + enabled: true + empty_unused_memory_level: 0 + activation_checkpointing: false + converter_type: "Qwen2ForCausalLM" + tensor_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + # Causes logprob error divergence for moonlight + apply_rope_fusion: False + + optimizer: + optimizer: "adam" + lr: 1.0e-6 + min_lr: 5.0e-7 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + 
lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 5.0e-7 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + generation: + backend: "vllm" + max_new_tokens: ${policy.max_total_sequence_length} + temperature: 1.0 + top_p: 1.0 + top_k: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: ${policy.max_total_sequence_length} + +data: + max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len + prompt_file: "examples/prompts/cot.txt" + system_prompt_file: null + dataset_name: "OpenMathInstruct-2" + +env: + math: + num_workers: 8 + +logger: + log_dir: "logs" # Base directory for all logs + num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal + wandb_enabled: false + tensorboard_enabled: false + mlflow_enabled: False + monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "grpo-dev" + name: "grpo-moonlight-16B-A3B-Instruct" + tensorboard: {} + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml new file mode 100755 index 0000000000..d099931839 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -0,0 +1,183 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + 
val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 + seed: 42 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + ratio_clip_c: null + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + token_level_loss: true +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-megatron + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 100 + checkpoint_must_save_by: null +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: ${policy.model_name} + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: false + megatron_cfg: + enabled: true + empty_unused_memory_level: 0 + activation_checkpointing: false + converter_type: "Qwen2ForCausalLM" + tensor_model_parallel_size: 2 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 5.0e-6 + min_lr: 5.0e-7 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: 
${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 5.0e-7 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + dynamic_batching: + enabled: false + sequence_packing: + enabled: true + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + make_sequence_length_divisible_by: 4 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 13 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 13 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + async_engine: false + precision: ${policy.precision} + tensor_parallel_size: 4 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + enforce_eager: False + colocated: + enabled: true + resources: + gpus_per_node: null + num_nodes: null +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 + shuffle: true +env: + math: + num_workers: 8 +logger: + log_dir: 
logs/grpo-qwen2.5-7b-instruct-4n8g-megatron + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-megatron + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml new file mode 100755 index 0000000000..89e434d3dc --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -0,0 +1,156 @@ +# GRPO Algorithm Configuration +defaults: "../../grpo_math_1B.yaml" + +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_num_steps: 1000000 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + # (default off) loss formulation improvements (docs/guides/grpo.md#loss) + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + token_level_loss: true + ratio_clip_c: null +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen3-30ba3b-8n8g-megatron + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 + checkpoint_must_save_by: null +policy: + model_name: "Qwen/Qwen3-30B-A3B" + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 # Only used when generating using HF backend + logprob_batch_size: 4 + max_total_sequence_length: 4096 + precision: "bfloat16" + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + + dtensor_cfg: + enabled: false + + optimizer: null # remove 
default FSDP optimizer + + scheduler: null # remove default FSDP scheduler + + dynamic_batching: + enabled: False + sequence_packing: + enabled: False # coming soon + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_ffd" + sequence_length_round: 64 + max_grad_norm: 1.0 + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + context_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + sequence_parallel: True + pipeline_dtype: ${policy.precision} + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 3.0e-7 + min_lr: 3.0e-8 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + clip_grad: ${policy.max_grad_norm} + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + #sgd + sgd_momentum: 0.9 + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 3.0e-8 + + env_vars: + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False" + + distributed_data_parallel_config: + 
grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + generation: + backend: "vllm" + max_new_tokens: ${policy.max_total_sequence_length} + temperature: 1.0 + top_p: 1.0 + top_k: null + stop_token_ids: null + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.7 + max_model_len: ${policy.max_total_sequence_length} +data: + max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len + prompt_file: "examples/prompts/cot.txt" + system_prompt_file: null + dataset_name: "OpenMathInstruct-2" +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen3-30ba3b-8n8g-megatron + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen3-30ba3b-8n8g-megatron + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 8 diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml new file mode 100644 index 0000000000..08b015512a --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -0,0 +1,135 @@ +sft: + max_num_epochs: 1 + max_num_steps: 1000000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 1 + val_at_start: false + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 100 + checkpoint_must_save_by: null +policy: + model_name: "meta-llama/Llama-3.1-70B" + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct ## specify if you'd like to use a 
tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + precision: "bfloat16" + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: false + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 4 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 2 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 2e-5 + min_lr: 2e-5 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: 0.0 + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 1 + lr_warmup_init: 2e-5 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + dynamic_batching: + enabled: false + sequence_packing: + enabled: false + # makes the training sequence length divisible by the tensor parallel 
size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + max_grad_norm: null + optimizer: + name: "torch.optim.AdamW" + kwargs: + lr: 2e-5 + weight_decay: 0.01 + betas: [0.9, 0.98] + eps: 1e-8 + # when using Dtensor, we need to set foreach + # and fused to False + foreach: False + fused: False +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + seed: 42 + shuffle: true +logger: + log_dir: "logs" # Base directory for all logs + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "sft-dev" + name: "openmathinstruct-nemorl-1M_train" + tensorboard: + log_dir: "tb_logs-openmathinstruct-nemorl-1M_train" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) +cluster: + gpus_per_node: 8 + num_nodes: 8 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml new file mode 100644 index 0000000000..b04201ac9f --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -0,0 +1,82 @@ +sft: + max_num_epochs: 1 + max_num_steps: 10000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 2 + val_at_start: false + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + metric_name: val_loss + higher_is_better: false + 
keep_top_k: 3 + save_period: 50 + checkpoint_must_save_by: null +policy: + model_name: meta-llama/Llama-3.1-8B + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 4 + context_parallel_size: 1 + custom_parallel_plan: null + dynamic_batching: + enabled: true + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 + sequence_packing: + enabled: false + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + eps: 1e-08 + foreach: false + fused: false +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + seed: 42 + shuffle: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: false + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml similarity index 66% rename from examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml rename to 
examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index 409c21670b..20adece59d 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -1,11 +1,11 @@ sft: max_num_epochs: 1 - max_num_steps: 2730 - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true + max_num_steps: 10000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 2 + val_at_start: false seed: 42 checkpointing: enabled: true @@ -13,16 +13,15 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.1-8B-Instruct + model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 - max_total_sequence_length: 1024 + train_global_batch_size: 512 + train_micro_batch_size: 2 + max_total_sequence_length: 4096 precision: bfloat16 dtensor_cfg: enabled: true @@ -34,6 +33,8 @@ policy: custom_parallel_plan: null dynamic_batching: enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 sequence_packing: enabled: false train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} @@ -44,21 +45,25 @@ policy: optimizer: name: torch.optim.AdamW kwargs: - lr: 5e-06 - weight_decay: 0.1 + lr: 2e-5 + weight_decay: 0.01 betas: - 0.9 - 0.98 - 
eps: 1e-05 + eps: 1e-08 foreach: false fused: false data: - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long wandb_enabled: true diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml similarity index 69% rename from examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml rename to examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index 1ac548c354..e9abb0771f 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -1,7 +1,7 @@ sft: max_num_epochs: 1 max_num_steps: 350 - val_period: 10 + val_period: 500 val_batches: 8 val_global_batch_size: 32 val_micro_batch_size: 1 @@ -13,16 +13,15 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 20 checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.1-8B-Instruct + model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 - max_total_sequence_length: 1024 + train_global_batch_size: 512 + 
train_micro_batch_size: 2 + max_total_sequence_length: 4096 precision: bfloat16 dtensor_cfg: enabled: true @@ -44,21 +43,25 @@ policy: optimizer: name: torch.optim.AdamW kwargs: - lr: 5e-06 - weight_decay: 0.1 + lr: 2e-5 + weight_decay: 0.01 betas: - 0.9 - 0.98 - eps: 1e-05 + eps: 1e-08 foreach: false fused: false data: - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp wandb_enabled: true @@ -70,7 +73,7 @@ logger: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml new file mode 100644 index 0000000000..c2791b3074 --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -0,0 +1,127 @@ +sft: + max_num_epochs: 1 + max_num_steps: 250 + val_period: 500 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 50 + checkpoint_must_save_by: null +policy: + model_name: meta-llama/Llama-3.1-8B + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + 
dtensor_cfg: + enabled: false + dynamic_batching: + enabled: false + sequence_packing: + enabled: true + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + max_grad_norm: 1 + optimizer: null + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 2 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 2 + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + sequence_parallel: false + freeze_moe_router: false + moe_router_dtype: null + moe_router_load_balancing_type: "aux_loss" + moe_router_bias_update_rate: 1e-3 + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 2.0e-5 + min_lr: 1.99999e-5 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1e-5 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 1.9999e-5 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + data_parallel_sharding_strategy: "optim_grads_params" + + +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2"
+ prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + seed: 42 + shuffle: true +logger: + log_dir: logs/sft-llama3.1-8b-1n8g-megatron + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: false + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-1n8g-megatron + tensorboard: + log_dir: tb_logs-sft-dev-openmathinstruct2 + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml similarity index 74% rename from examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml rename to examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 5e7bc8b8d7..28331f4c69 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -1,28 +1,27 @@ sft: max_num_epochs: 1 max_num_steps: 250 - val_period: 10 + val_period: 500 val_batches: 8 val_global_batch_size: 32 val_micro_batch_size: 1 val_at_start: true seed: 42 checkpointing: - enabled: false #true - checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp1 + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.1-8B-Instruct + model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == 
''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 + train_global_batch_size: 512 train_micro_batch_size: 2 - max_total_sequence_length: 1024 + max_total_sequence_length: 4096 precision: bfloat16 dtensor_cfg: enabled: false @@ -58,10 +57,10 @@ policy: optimizer: optimizer: "adam" - lr: 5.0e-6 - min_lr: 4.9999e-6 - weight_decay: 0.1 - bf16: false + lr: 2.0e-5 + min_lr: 1.99999e-5 + weight_decay: 0.01 + bf16: true fp16: false params_dtype: "float32" @@ -86,7 +85,7 @@ policy: lr_decay_style: "constant" lr_decay_iters: null lr_warmup_iters: 50 - lr_warmup_init: 4.9999e-6 + lr_warmup_init: 1.9999e-5 distributed_data_parallel_config: grad_reduce_in_fp32: false @@ -97,14 +96,18 @@ policy: data: - add_generation_prompt: false - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: - log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp1 + log_dir: logs/sft-llama3.1-8b-1n8g-megatron wandb_enabled: true tensorboard_enabled: true mlflow_enabled: false @@ -112,9 +115,9 @@ logger: num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl - name: sft-llama3.1-8b-instruct-1n8g-fsdp1 + name: sft-llama3.1-8b-1n8g-megatron tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml similarity index 87% rename from examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml rename to examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml index
6a5eb97f6f..de2e6f9eee 100644 --- a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml @@ -13,7 +13,7 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.2-1B @@ -53,12 +53,16 @@ policy: foreach: false fused: false data: - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1 wandb_enabled: true @@ -70,7 +74,7 @@ logger: project: nemo-rl name: sft-llama3.2-1b-1n8g-fsdp2tp1 tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml similarity index 88% rename from examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml rename to examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml index 516cda4bbb..85f4e7f111 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -13,7 +13,7 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: model_name: Qwen/Qwen2.5-32B @@ -53,11 +53,14 @@ policy: foreach: false fused: false data: - max_input_seq_length: 16000 - dataset_name: squad + 
max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true logger: log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt @@ -70,7 +73,7 @@ logger: project: nemo-rl name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index 640911bc0f..bbea9f1767 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -36,11 +36,17 @@ policy: context_parallel_size: 1 custom_parallel_plan: null + megatron_cfg: + enabled: false + dynamic_batching: enabled: false sequence_packing: enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 # makes the training sequence length divisible by the tensor parallel size # this is useful for sequence parallel training diff --git a/tests/check_metrics.py b/tests/check_metrics.py index a48c2f4875..df97d22a1f 100644 --- a/tests/check_metrics.py +++ b/tests/check_metrics.py @@ -31,9 +31,32 @@ def max(value): return __builtins__.max(float(v) for v in value.values()) -def mean(value): - """Return the mean of values in a dictionary.""" - return statistics.mean(float(v) for v in value.values()) +def mean(value, range_start=1, range_end=0): + """Return the mean of values (or a range of values) in a dictionary. + + Note: + step, and ranges, are 1 indexed. Range_end is exclusive. 
+ range_end=0 means to include until the last step in the run + """ + + ## find potential offset that might arise from resuming from a checkpoint + max_step_reached = __builtins__.max([int(s) for s in value.keys()]) + ## this is the number of steps that occurred prior to resuming + offset = max_step_reached - len(value) + + num_elem = len(value) + if range_start < 0: + range_start += num_elem + 1 + offset + if range_end <= 0: + range_end += num_elem + 1 + offset + + vals = [] + for step, v in value.items(): + if range_start <= int(step) and int(step) < range_end: + vals.append(float(v)) + + print(vals) + return statistics.mean(vals) def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]: diff --git a/tests/functional/dpo_megatron.sh b/tests/functional/dpo_megatron.sh new file mode 100755 index 0000000000..8c1524c2c5 --- /dev/null +++ b/tests/functional/dpo_megatron.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# clean up checkpoint directory on exit +trap "rm -rf /tmp/sft_checkpoints" EXIT + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetches metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +set -eou pipefail + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR + +cd $PROJECT_ROOT +uv run $PROJECT_ROOT/examples/run_dpo.py \ + --config $PROJECT_ROOT/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml \ + policy.model_name=Qwen/Qwen3-0.6B \ + cluster.gpus_per_node=2 \ + dpo.max_num_steps=3 \ + dpo.val_batches=1 \ + dpo.val_period=3 \ + logger.tensorboard_enabled=true \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=false \ + logger.monitor_gpus=true \ + checkpointing.enabled=false \ + policy.megatron_cfg.tensor_model_parallel_size=1 \ + policy.train_global_batch_size=8 \ + $@ \ + 2>&1 | tee $RUN_LOG + +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["3"] < 5' \ + diff --git a/tests/functional/sft_megatron.sh b/tests/functional/sft_megatron.sh new file mode 100755 index 0000000000..dfb7fcfdba --- /dev/null +++ b/tests/functional/sft_megatron.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# clean up checkpoint directory on exit +trap "rm -rf /tmp/sft_checkpoints" EXIT + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetches metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +set -eou pipefail + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR + +cd $PROJECT_ROOT +uv run $PROJECT_ROOT/examples/run_sft.py \ + --config $PROJECT_ROOT/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml \ + policy.model_name=Qwen/Qwen3-0.6B \ + policy.tokenizer.name=Qwen/Qwen3-0.6B \ + cluster.gpus_per_node=2 \ + sft.max_num_steps=3 \ + sft.val_batches=1 \ + sft.val_period=3 \ + logger.tensorboard_enabled=true \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=false \ + logger.monitor_gpus=true \ + checkpointing.enabled=false \ + policy.megatron_cfg.pipeline_model_parallel_size=1 \ + $@ \ + 2>&1 | tee $RUN_LOG + +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["3"] < 0.8' \ + diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh index f5b29b7db7..0162bd8bb9 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["20"] < 3.4' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["20"] < 0.6' + 'data["train/preference_loss"]["20"] < 0.6' \ + 'mean(data["timing/train/total_step_time"], -10, -1) < 7.8' fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh 
b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh similarity index 91% rename from tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh rename to tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh index e9ccb1e147..df74127ba2 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["150"] < 3.0' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["150"] < 0.4' + 'data["train/preference_loss"]["150"] < 0.4' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 24' fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh similarity index 91% rename from tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh rename to tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh index e9ccb1e147..8701d63d1f 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["150"] < 3.0' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["150"] < 0.4' + 'data["train/preference_loss"]["150"] < 0.4' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 11.5' fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh index f5b29b7db7..0bc8e13e28 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh +++ 
b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["20"] < 3.4' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["20"] < 0.6' + 'data["train/preference_loss"]["20"] < 0.6' \ + 'mean(data["timing/train/total_step_time"], -10) < 6.7' fi diff --git a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh index 6606099df7..48691c0df4 100755 --- a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh @@ -36,5 +36,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] > 0.69314' \ 'data["train/loss"]["1"] < 0.69316' \ - 'data["train/loss"]["150"] < 0.55' + 'data["train/loss"]["150"] < 0.55' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 1.3' fi diff --git a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh index 94781e4931..4624b7282d 100755 --- a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh +++ b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh @@ -35,5 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - "data[\"train/token_mult_prob_error\"][\"${MAX_STEPS}\"] < 1.1" + "data[\"train/token_mult_prob_error\"][\"${MAX_STEPS}\"] < 1.1" \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 14' fi diff --git 
a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh index 45cfad6e83..3661370fa6 100755 --- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh @@ -35,6 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["500"] < 1.1' + 'data["train/token_mult_prob_error"]["500"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 10' fi diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh new file mode 100755 index 0000000000..83071c70e3 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh @@ -0,0 +1,42 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path 
$JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["500"] < 1.1' \ + 'data["train/reward"]["500"] > 0.1' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 10.5' + +fi diff --git a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh new file mode 100755 index 0000000000..7288252eec --- /dev/null +++ b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=150 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +PYTHONPATH=$HF_HOME/modules:$PYTHONPATH uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 
'data["train/token_mult_prob_error"]["30"] < 1.1' \ + 'mean(data["train/reward"]) > 0.45' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 70' +fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh new file mode 100755 index 0000000000..45f354043a --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' \ + 'mean(data["train/reward"]) > 0.56' \ + 'mean(data["timing/train/total_step_time"], 2) < 50' +fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh index 98df00c25c..0a31e74590 100755 --- 
a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh @@ -35,6 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["450"] < 1.1' + 'data["train/token_mult_prob_error"]["450"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], 2) < 25' fi diff --git a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh new file mode 100755 index 0000000000..f89041cd40 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + 
uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' \ + 'data["train/reward"]["30"] > 0.43' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 220' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh new file mode 100755 index 0000000000..718322e33a --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh @@ -0,0 +1,42 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=300 +MAX_STEPS=300 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA-NeMo/RL/issues/263 +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 0.55' \ + 'data["train/loss"]["300"] < 0.285' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'mean(data["timing/train/total_step_time"], 2) < 20' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh similarity index 81% rename from tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh rename to tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh index b22c00dec0..76c600c648 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh @@ -2,11 +2,10 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env -# TODO: @ashors real convergence run (dataset only has 2737) # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=2730 -MAX_STEPS=2730 +STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=120 # ===== END CONFIG ===== @@ -35,9 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA-NeMo/RL/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["2730"] < 0.3' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 50' + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'mean(data["timing/train/total_step_time"], 2) < 10' fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..90fd03467c --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh @@ -0,0 +1,42 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=250 +MAX_STEPS=250 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA-NeMo/RL/issues/263 +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 80' \ + 'mean(data["timing/train/total_step_time"], 2) < 22' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh similarity index 83% rename from tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh rename to tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh index abed80e5ed..8f69d0f0b8 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh @@ -4,8 +4,8 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=350 -MAX_STEPS=350 +STEPS_PER_RUN=50 +MAX_STEPS=50 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=45 # ===== END CONFIG ===== @@ -35,9 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["350"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 45' + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["50"] < 0.38' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'mean(data["timing/train/total_step_time"], 2) < 32' fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh 
b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh similarity index 87% rename from tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh rename to tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh index cf72bd9377..fe54af1fbd 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh @@ -31,9 +31,9 @@ uv run examples/run_sft.py \ # Convert tensorboard logs to json uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -# TODO: @ashors tighter bounds if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2' \ - 'data["train/loss"]["250"] < 0.3' -fi \ No newline at end of file + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'mean(data["timing/train/total_step_time"], 2) < 6' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh new file mode 100755 index 0000000000..bc5eae73a2 --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=250 +MAX_STEPS=250 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + 
checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'mean(data["timing/train/total_step_time"], 2) < 20' +fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh similarity index 82% rename from tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh rename to tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh index 32c66dae04..a4b44bd1f1 100755 --- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh @@ -4,8 +4,8 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 +STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 # ===== END CONFIG ===== @@ -34,8 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["500"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 25' + 'data["train/loss"]["1"] < 0.82' \ + 'data["train/loss"]["250"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 25' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 0.6' fi diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh 
b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh similarity index 96% rename from tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh rename to tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh index 257add6fc5..d16a3d8d98 100755 --- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh @@ -37,7 +37,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 1.5' \ + 'data["train/loss"]["1"] < 0.37' \ 'data["train/loss"]["20"] < 0.3' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 35' fi diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 07c3eb5b9c..def1d87d94 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -10,29 +10,40 @@ tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh # Dtensor (Qwen/Qwen2.5-7B-Instruct) tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.sh +# Megatron +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh + # Functional 32b run tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.sh +# Functional moonlight run +tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh + # Deepscaler (short tests) tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh ####### # SFT # ####### # 1N 1B/8B runs -tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh +tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh # Dtensor 
(8B) -tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh +tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh +# dynamic batching +tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh # Functional 32b test -tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh +tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh # Megatron -tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh +tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh +# sequence packing +tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh ####### # DPO # diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index e339ef0bc1..e58f5b4d71 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -11,17 +11,23 @@ tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.sh # Long Gemma3 27b run tests/test_suites/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.sh +# Long Megatron Qwen3 30B-A3B run +tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh + ####### # SFT # ####### # Long 8b convergence -tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh +tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh + +# 300 step 70b convergence +tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh ####### # DPO # ####### # Long 8b convergence -tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh -tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh +tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh +tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh diff --git a/tests/unit/models/megatron/converters/test_common.py b/tests/unit/models/megatron/converters/test_common.py new file mode 100755 index 0000000000..d98f1fe905 --- /dev/null +++ b/tests/unit/models/megatron/converters/test_common.py @@ -0,0 +1,245 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import Mock, patch + +import torch + +from nemo_rl.models.megatron.converters.common import ( + get_global_expert_num, + get_global_key_from_local_key, + get_global_layer_num, + get_local_expert_num, + get_local_layer_num, + split_fc1_etp, + split_fc1_tp, + split_qkv_bias_gpu, + split_qkv_gpu, + update_transforms_for_nemorl, +) + + +class TestLayerNumberFunctions: + """Test functions related to layer number extraction and conversion.""" + + def test_get_local_layer_num_valid(self): + """Test get_local_layer_num with valid layer keys.""" + assert get_local_layer_num("layers.5.attention.weight") == 5 + assert get_local_layer_num("decoder.layers.10.mlp.weight") == 10 + assert get_local_layer_num("model.layers.0.self_attn.weight") == 0 + + def test_get_local_layer_num_invalid(self): + """Test get_local_layer_num with invalid layer keys.""" + assert get_local_layer_num("attention.weight") is None + assert get_local_layer_num("layers.abc.weight") is None + assert get_local_layer_num("layers.") is None + + def test_get_global_layer_num_pp(self): + """Test get_global_layer_num with simple pipeline configuration.""" + mock_cfg = Mock() + mock_cfg.num_layers = 10 + mock_cfg.num_layers_in_first_pipeline_stage = 4 + mock_cfg.num_layers_in_last_pipeline_stage = 3 + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + 
mock_ps.get_pipeline_model_parallel_rank.return_value = 1 + mock_ps.get_pipeline_model_parallel_world_size.return_value = 3 + + result = get_global_layer_num("layers.2.weight", mock_cfg) + assert result == 6 + + +class TestExpertNumberFunctions: + """Test functions related to expert number extraction and conversion.""" + + def test_get_local_expert_num_valid(self): + """Test get_local_expert_num with valid expert keys.""" + assert get_local_expert_num("layers.0.mlp.experts.weight2") == 2 + assert get_local_expert_num("decoder.layers.1.experts.weight5") == 5 + assert get_local_expert_num("model.layers.0.experts.weight0") == 0 + + def test_get_local_expert_num_invalid(self): + """Test get_local_expert_num with invalid expert keys.""" + assert get_local_expert_num("layers.0.mlp.weight") is None + assert get_local_expert_num("layers.0.mlp.experts.2._extra_state") is None + + def test_get_global_expert_num(self): + """Test get_global_expert_num with expert parallel configuration.""" + mock_cfg = Mock() + mock_cfg.num_moe_experts = 8 + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + mock_ps.get_expert_model_parallel_rank.return_value = 1 + mock_ps.get_expert_model_parallel_world_size.return_value = 2 + + result = get_global_expert_num("layers.0.mlp.experts.weight2", mock_cfg) + assert result == 6 # 8 // 2 + 2 + + +class TestKeyConversionFunctions: + """Test functions related to key conversion between local and global.""" + + def test_get_global_key_from_local_key_layer_only(self): + """Test key conversion with only layer numbers.""" + mock_cfg = Mock() + mock_cfg.num_layers = 12 + mock_cfg.num_layers_in_first_pipeline_stage = None + mock_cfg.num_layers_in_last_pipeline_stage = None + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + mock_ps.get_pipeline_model_parallel_rank.return_value = 1 + mock_ps.get_pipeline_model_parallel_world_size.return_value = 2 + + result = 
get_global_key_from_local_key( + "layers.3.attention.weight", mock_cfg + ) + assert result == "layers.9.attention.weight" + + def test_get_global_key_from_local_key_expert_and_layer(self): + """Test key conversion with only expert numbers.""" + mock_cfg = Mock() + mock_cfg.num_moe_experts = 8 + mock_cfg.num_layers = 12 + mock_cfg.num_layers_in_first_pipeline_stage = None + mock_cfg.num_layers_in_last_pipeline_stage = None + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + mock_ps.get_expert_model_parallel_rank.return_value = 1 + mock_ps.get_expert_model_parallel_world_size.return_value = 2 + + mock_ps.get_pipeline_model_parallel_rank.return_value = 1 + mock_ps.get_pipeline_model_parallel_world_size.return_value = 3 + + result = get_global_key_from_local_key( + "layers.0.mlp.experts.weight2", mock_cfg + ) + assert result == "layers.4.mlp.experts.weight6" + + +class TestTensorSplittingFunctions: + """Test functions related to tensor splitting operations.""" + + def test_split_fc1_tp(self): + """Test split_fc1_tp function.""" + mock_ctx = Mock() + mock_ctx.source.config.tensor_model_parallel_size = 2 + + # Create a tensor with shape (4, 10) representing 2 TP ranks with 2 components each + linear_fc1 = torch.randn(4, 10) + + gate_proj, up_proj = split_fc1_tp(mock_ctx, linear_fc1) + + assert gate_proj.shape == (2, 10) + assert up_proj.shape == (2, 10) + assert torch.allclose(gate_proj, linear_fc1[::2]) + assert torch.allclose(up_proj, linear_fc1[1::2]) + + def test_split_fc1_etp(self): + """Test split_fc1_etp function.""" + mock_ctx = Mock() + mock_ctx.source.config.expert_tensor_parallel_size = 2 + + # Create a tensor with shape (4, 10) representing 2 ETP ranks with 2 components each + linear_fc1 = torch.randn(4, 10) + + gate_proj, up_proj = split_fc1_etp(mock_ctx, linear_fc1) + + assert gate_proj.shape == (2, 10) + assert up_proj.shape == (2, 10) + assert torch.allclose(gate_proj, linear_fc1[::2]) + assert 
torch.allclose(up_proj, linear_fc1[1::2]) + + def test_split_qkv_gpu(self): + """Test split_qkv_gpu function.""" + mock_ctx = Mock() + mock_ctx.source.config.num_attention_heads = 8 + mock_ctx.source.config.num_query_groups = 2 + mock_ctx.source.config.kv_channels = 16 + + # Create QKV tensor: (heads + 2*groups) * head_size * hidden_size + qkv_total_dim = 8 + 2 * 2 # 12 + linear_qkv = torch.randn(qkv_total_dim, 16, 64) + + q_proj, k_proj, v_proj = split_qkv_gpu(mock_ctx, linear_qkv) + + # Q should have 8 heads * 16 channels = 128 + assert q_proj.shape == (128, 64) + # K and V should have 2 groups * 16 channels = 32 each + assert k_proj.shape == (32, 64) + assert v_proj.shape == (32, 64) + + def test_split_qkv_bias_gpu(self): + """Test split_qkv_bias_gpu function.""" + mock_ctx = Mock() + mock_ctx.source.config.num_attention_heads = 8 + mock_ctx.source.config.num_query_groups = 2 + mock_ctx.source.config.kv_channels = 16 + + # Create QKV bias tensor: (heads + 2*groups) * head_size + qkv_total_dim = 8 + 2 * 2 # 12 + qkv_bias = torch.randn(qkv_total_dim, 16) + + q_bias, k_bias, v_bias = split_qkv_bias_gpu(mock_ctx, qkv_bias) + + # Q should have 8 heads * 16 channels = 128 + assert q_bias.shape == (128,) + # K and V should have 2 groups * 16 channels = 32 each + assert k_bias.shape == (32,) + assert v_bias.shape == (32,) + + +class TestTransformUpdateFunctions: + """Test functions related to transform updates.""" + + def test_update_transforms_for_nemorl(self): + """Test update_transforms_for_nemorl function.""" + # Create mock transforms + mock_transform1 = Mock() + mock_transform1.transform.__name__ = "split_fc1" + mock_transform1.source_key = "layers.0.mlp.experts.0.linear_fc1.weight" + + mock_transform2 = Mock() + mock_transform2.transform.__name__ = "split_fc1" + mock_transform2.source_key = "layers.0.mlp.shared_experts.linear_fc1.weight" + + mock_transform3 = Mock() + mock_transform3.transform.__name__ = "split_qkv" + + mock_transform4 = Mock() + 
mock_transform4.transform.__name__ = "split_qkv_bias" + + transforms = [ + mock_transform1, + mock_transform2, + mock_transform3, + mock_transform4, + ] + + updated_transforms = update_transforms_for_nemorl(transforms) + + # Check that expert transforms use split_fc1_etp + assert updated_transforms[0].transform == split_fc1_etp + # Check that non-expert transforms use split_fc1_tp + assert updated_transforms[1].transform == split_fc1_tp + # Check that qkv transforms are updated + assert updated_transforms[2].transform == split_qkv_gpu + assert updated_transforms[3].transform == split_qkv_bias_gpu From 3fe289816aa6591671758d67e22d73671953265a Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 12 Aug 2025 14:55:56 -0700 Subject: [PATCH 02/12] rename unit test file to avoid name conflict Signed-off-by: ashors1 --- .../models/megatron/{test_common.py => test_megatron_common.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/models/megatron/{test_common.py => test_megatron_common.py} (100%) diff --git a/tests/unit/models/megatron/test_common.py b/tests/unit/models/megatron/test_megatron_common.py similarity index 100% rename from tests/unit/models/megatron/test_common.py rename to tests/unit/models/megatron/test_megatron_common.py From 473f408aac38eab3e9557109c8aaf49c8c3c4af7 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 13 Aug 2025 10:35:16 -0700 Subject: [PATCH 03/12] rename tests Signed-off-by: ashors1 --- .../converters/{test_common.py => test_converters_common.py} | 0 .../models/megatron/{test_megatron_common.py => test_common.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/unit/models/megatron/converters/{test_common.py => test_converters_common.py} (100%) rename tests/unit/models/megatron/{test_megatron_common.py => test_common.py} (100%) diff --git a/tests/unit/models/megatron/converters/test_common.py b/tests/unit/models/megatron/converters/test_converters_common.py similarity index 100% rename from 
tests/unit/models/megatron/converters/test_common.py rename to tests/unit/models/megatron/converters/test_converters_common.py diff --git a/tests/unit/models/megatron/test_megatron_common.py b/tests/unit/models/megatron/test_common.py similarity index 100% rename from tests/unit/models/megatron/test_megatron_common.py rename to tests/unit/models/megatron/test_common.py From 23966de8955938b0a9d868134afa71d6a256818f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 09:03:17 -0700 Subject: [PATCH 04/12] fix unit tests Signed-off-by: ashors1 --- .../unit/models/megatron/converters/test_converters_common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/models/megatron/converters/test_converters_common.py b/tests/unit/models/megatron/converters/test_converters_common.py index d98f1fe905..1177f53af1 100755 --- a/tests/unit/models/megatron/converters/test_converters_common.py +++ b/tests/unit/models/megatron/converters/test_converters_common.py @@ -14,6 +14,7 @@ from unittest.mock import Mock, patch +import pytest import torch from nemo_rl.models.megatron.converters.common import ( @@ -29,6 +30,9 @@ update_transforms_for_nemorl, ) +# Apply mcore marker to all tests in this module +pytestmark = pytest.mark.mcore + class TestLayerNumberFunctions: """Test functions related to layer number extraction and conversion.""" From 1725226e50757b39bf07b3a0c085788d8ad8a286 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 15:17:10 -0700 Subject: [PATCH 05/12] remove debug code Signed-off-by: ashors1 --- tests/check_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/check_metrics.py b/tests/check_metrics.py index df97d22a1f..bc9f6ced04 100644 --- a/tests/check_metrics.py +++ b/tests/check_metrics.py @@ -55,7 +55,6 @@ def mean(value, range_start=1, range_end=0): if range_start <= int(step) and int(step) < range_end: vals.append(float(v)) - print(vals) return statistics.mean(vals) From 3a4d9a8e1c54f8749d4fcb7af5f79c971ea6d39f Mon 
Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 15:23:58 -0700 Subject: [PATCH 06/12] fix import error Signed-off-by: ashors1 --- .../converters/test_converters_common.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/unit/models/megatron/converters/test_converters_common.py b/tests/unit/models/megatron/converters/test_converters_common.py index 1177f53af1..c8731eb573 100755 --- a/tests/unit/models/megatron/converters/test_converters_common.py +++ b/tests/unit/models/megatron/converters/test_converters_common.py @@ -17,18 +17,21 @@ import pytest import torch -from nemo_rl.models.megatron.converters.common import ( - get_global_expert_num, - get_global_key_from_local_key, - get_global_layer_num, - get_local_expert_num, - get_local_layer_num, - split_fc1_etp, - split_fc1_tp, - split_qkv_bias_gpu, - split_qkv_gpu, - update_transforms_for_nemorl, -) +try: + from nemo_rl.models.megatron.converters.common import ( + get_global_expert_num, + get_global_key_from_local_key, + get_global_layer_num, + get_local_expert_num, + get_local_layer_num, + split_fc1_etp, + split_fc1_tp, + split_qkv_bias_gpu, + split_qkv_gpu, + update_transforms_for_nemorl, + ) +except ImportError: + pass # Apply mcore marker to all tests in this module pytestmark = pytest.mark.mcore From ad3f1d540211978f0a24e8f5a7bf2db28b69d7dd Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 22:00:06 -0700 Subject: [PATCH 07/12] fix config keys Signed-off-by: ashors1 --- examples/configs/dpo.yaml | 1 - ....1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml | 1 - ...po-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml | 1 - ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 1 - ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 1 - ...llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml | 1 - ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 2 - ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 2 - .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 2 - 
...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 4 +- ...lama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 2 - .../sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml | 1 - .../llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml | 1 - ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 2 - .../llm/sft-llama3.1-8b-1n8g-megatron.yaml | 1 - .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml | 1 - ...wen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml | 1 - examples/configs/rm.yaml | 2 - examples/configs/sft.yaml | 1 - examples/configs/sft_openmathinstruct2.yaml | 1 - .../sft_openmathinstruct2_megatron.yaml | 149 ++++++++++++++++++ nemo_rl/data/__init__.py | 1 + nemo_rl/utils/logger.py | 1 - 23 files changed, 151 insertions(+), 29 deletions(-) create mode 100644 examples/configs/sft_openmathinstruct2_megatron.yaml diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index ecc159b484..74a74efc21 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -158,7 +158,6 @@ logger: tensorboard_enabled: false mlflow_enabled: false # Disable MLflow logging monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: "dpo-dev" name: "dpo" diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml index b7040abb37..72dcb9ad1e 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml @@ -82,7 +82,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml 
b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml index 01395f5247..22851b368c 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml @@ -82,7 +82,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 49e8a78422..1960502a09 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -115,7 +115,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 37d20bbcdb..987e70dc88 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -115,7 +115,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml index 
1e35e24d4e..22870f0e66 100644 --- a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml @@ -83,7 +83,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml index 55de4c59f4..8f17d32819 100755 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -36,8 +36,6 @@ policy: logprob_batch_size: 4 max_total_sequence_length: 512 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false optimizer: null megatron_cfg: enabled: true diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml index d099931839..13689e6ddc 100755 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -36,8 +36,6 @@ policy: logprob_batch_size: 2 max_total_sequence_length: 4096 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false megatron_cfg: diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 89e434d3dc..048ed32782 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -38,8 +38,6 @@ policy: logprob_batch_size: 4 max_total_sequence_length: 4096 
precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index 08b015512a..cd5751f523 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -23,8 +23,6 @@ policy: train_micro_batch_size: 1 max_total_sequence_length: 4096 precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false megatron_cfg: @@ -114,8 +112,8 @@ data: add_eos: true add_generation_prompt: true output_key: 'generated_solution' - seed: 42 shuffle: true + seed: 42 logger: log_dir: "logs" # Base directory for all logs wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml index b04201ac9f..d7906b82e0 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -23,8 +23,6 @@ policy: train_micro_batch_size: 2 max_total_sequence_length: 4096 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: true cpu_offload: false diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index 20adece59d..1fc0ccec7c 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -70,7 +70,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false 
monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index e9abb0771f..8c3f14b531 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -68,7 +68,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index c2791b3074..4ad9355446 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -23,8 +23,6 @@ policy: train_micro_batch_size: 2 max_total_sequence_length: 4096 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false dynamic_batching: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 28331f4c69..e5e86dd302 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -112,7 +112,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.1-8b-1n8g-megatron diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml 
b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml index de2e6f9eee..165e2fa9a3 100644 --- a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml @@ -69,7 +69,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.2-1b-1n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml index 85f4e7f111..800d94711e 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -68,7 +68,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml index 20d4cf6a18..4adffdc5d7 100644 --- a/examples/configs/rm.yaml +++ b/examples/configs/rm.yaml @@ -31,8 +31,6 @@ policy: train_micro_batch_size: 1 max_total_sequence_length: 8192 precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false reward_model_cfg: enabled: true # loads model as a Reward Model (do not change) diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 01864d7691..cd7232527c 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -141,7 +141,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: "sft-dev" name: 
"sft-dev-${data.dataset_name}" diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index bbea9f1767..09354a2039 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -82,7 +82,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: "sft-dev" name: "openmathinstruct-nemorl-1M_train" diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml new file mode 100644 index 0000000000..17b7ddeaee --- /dev/null +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -0,0 +1,149 @@ +# SFT Algorithm Configuration +defaults: sft_openmathinstruct2.yaml + +sft: + max_num_epochs: 1 + max_num_steps: 1000000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 + +checkpointing: + enabled: true + checkpoint_dir: "results/sft_openmathinstruct2" + metric_name: "val_loss" + higher_is_better: false + keep_top_k: 100 + save_period: 500 + +policy: + model_name: "meta-llama/Llama-3.1-8B" + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + precision: "bfloat16" + + dtensor_cfg: + enabled: false + + megatron_cfg: + activation_checkpointing: false + context_parallel_size: 1 + distributed_data_parallel_config: + average_in_collective: true + data_parallel_sharding_strategy: optim_grads_params + grad_reduce_in_fp32: true + overlap_grad_reduce: true + overlap_param_gather: true + empty_unused_memory_level: 1 + enabled: true + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + 
num_layers_in_last_pipeline_stage: null + optimizer: + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-8 + bf16: true + clip_grad: 0 + fp16: false + lr: 0.00002 + min_lr: 0.00002 + optimizer: adam + params_dtype: bfloat16 + sgd_momentum: 0.9 + use_distributed_optimizer: true + use_precision_aware_optimizer: false #true ## TODO: precision aware optim not working with fp8. Is this expected? + weight_decay: 0.01 + + ## recently introduced, our current mcore commit doesn't have this + #fp8_recipe: delayed + + pipeline_dtype: bfloat16 + pipeline_model_parallel_size: 1 + scheduler: + end_weight_decay: 0.01 + lr_decay_iters: null + lr_decay_style: constant + lr_warmup_init: 0.00001999999 + lr_warmup_iters: 1 + start_weight_decay: 0.01 + weight_decay_incr_style: constant + sequence_parallel: false + tensor_model_parallel_size: 4 ## TODO: should not need this large TP size + + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + env_vars: + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False" + + fp8_cfg: + enabled: true + fp8: hybrid + fp8_recipe: delayed + fp8_param: true # false gives the following error: "RuntimeError: /TransformerEngine/transformer_engine/common/gemm/cublaslt_gemm.cu:116 in function CanonicalizeGemmInput: Assertion failed: !is_fp8_dtype(ret.Atype). 
Input A is missing column-wise usage" + fp8_dot_product_attention: false #true + fp8_multi_head_attention: false #true + + dynamic_batching: + enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 + + + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${mul:16, ${policy.megatron_cfg.tensor_model_parallel_size}} + max_grad_norm: null + + optimizer: null + +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + +logger: + log_dir: "logs" # Base directory for all logs + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + tensorboard_enabled: true + mlflow_enabled: false + monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "sft-openmathinstruct-megatron" + name: "llama8b" + tensorboard: + log_dir: "tb_logs-openmathinstruct-nemorl-1M_train" + mlflow: + experiment_name: "sft-dev" + run_name: "openmathinstruct-nemorl-1M_train" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + gpus_per_node: 8 + num_nodes: 2 + diff --git a/nemo_rl/data/__init__.py b/nemo_rl/data/__init__.py index df14a1546f..257a2db66f 100644 --- a/nemo_rl/data/__init__.py +++ b/nemo_rl/data/__init__.py @@ -29,6 +29,7 @@ class DataConfig(TypedDict): 
add_system_prompt: NotRequired[bool] split: NotRequired[str] shuffle: NotRequired[bool] + seed: NotRequired[int] class MathDataConfig(DataConfig): diff --git a/nemo_rl/utils/logger.py b/nemo_rl/utils/logger.py index 4cf2621cd4..e70af2117f 100644 --- a/nemo_rl/utils/logger.py +++ b/nemo_rl/utils/logger.py @@ -76,7 +76,6 @@ class LoggerConfig(TypedDict): mlflow: NotRequired[MLflowConfig] monitor_gpus: bool gpu_monitoring: GPUMonitoringConfig - num_val_samples_to_print: int class LoggerInterface(ABC): From 59c896b39a1bdfaa1581e29bb0c80733a29ba680 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 20 Aug 2025 08:19:13 -0700 Subject: [PATCH 08/12] fix conflict Signed-off-by: ashors1 --- tests/test_suites/nightly.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 9abed7b02e..e4c488bf6f 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -28,7 +28,6 @@ tests/test_suites/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1. 
tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh ->>>>>>> 2b87def7971f01d6060a5dfc3b9e2df58f832922 # GRPO math test run (32K context mcore) tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh From 30328425c44d69841e63121fa8496c8656ee0928 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 20 Aug 2025 11:56:48 -0700 Subject: [PATCH 09/12] fix unit test Signed-off-by: ashors1 --- examples/configs/grpo_math_1B.yaml | 2 ++ examples/configs/sft.yaml | 10 +++++++++- tests/test_suites/release.txt | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index b797afee17..384041607e 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -114,6 +114,8 @@ policy: use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" + env_vars: null + # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. 
dynamic_batching: diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index cd7232527c..2319568475 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -42,6 +42,8 @@ policy: dynamic_batching: enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 sequence_packing: enabled: False @@ -125,7 +127,7 @@ policy: overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" - + use_custom_fsdp: false data: max_input_seq_length: ${policy.max_total_sequence_length} @@ -135,6 +137,12 @@ data: add_generation_prompt: false shuffle: true + ## unused with squad dataset + prompt_file: null + split: null + output_key: null + seed: null + logger: log_dir: "logs" # Base directory for all logs wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 2afd3ccafe..bd117a83f8 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -12,7 +12,7 @@ tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.sh tests/test_suites/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.sh # Long Megatron Qwen3 30B-A3B run -tests/test_suites/llm/grpo-qwen3-30ba3b-16n8g-megatron.sh +tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh ####### # SFT # From fd723ea13bd5b6b26d0ca655b2fa225e3015f308 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 21 Aug 2025 09:22:27 -0700 Subject: [PATCH 10/12] fix empty env vars Signed-off-by: ashors1 --- examples/configs/grpo_math_1B.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 384041607e..2d22a94ffe 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -114,7 +114,7 @@ policy: use_custom_fsdp: false 
data_parallel_sharding_strategy: "optim_grads_params" - env_vars: null + env_vars: {} # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. From 31d832a2b211ec07c9d0cfac89eebc6637fea924 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 21 Aug 2025 15:13:23 -0700 Subject: [PATCH 11/12] re-disable expandable segments for qwen30ba3b Signed-off-by: ashors1 --- examples/configs/grpo_math_qwen30ba3b_megatron.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml index 3040d20ffc..1a0cc651c7 100644 --- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml +++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml @@ -55,7 +55,10 @@ policy: lr_decay_iters: null lr_warmup_iters: 13 lr_warmup_init: 3.0e-8 - + + env_vars: + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False" + generation: backend: "vllm" max_new_tokens: ${policy.max_total_sequence_length} From 84b460b34ffa7260c95b8af7a7be601cb410435f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 22 Aug 2025 12:28:49 -0700 Subject: [PATCH 12/12] fix null env vars Signed-off-by: ashors1 --- examples/configs/grpo_math_1B.yaml | 2 +- nemo_rl/models/policy/lm_policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 2d22a94ffe..384041607e 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -114,7 +114,7 @@ policy: use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" - env_vars: {} + env_vars: null # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. 
diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index c853e200d0..754faea5b7 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -133,7 +133,7 @@ def __init__( name_prefix=name_prefix, workers_per_node=workers_per_node, sharding_annotations=self.sharding_annotations, - env_vars=env_vars, + env_vars=env_vars or {}, ) if config["dynamic_batching"]["enabled"]: