NVIDIA-NeMo · terrykong · Aug 23, 2025 · Aug 12, 2025 · Aug 12, 2025 · Aug 13, 2025
@@ -158,7 +158,6 @@ logger:
   tensorboard_enabled: false
   mlflow_enabled: false  # Disable MLflow logging
   monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
-  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
   wandb:
     project: "dpo-dev"
     name: "dpo"

@@ -114,6 +114,8 @@ policy:
       use_custom_fsdp: false
       data_parallel_sharding_strategy: "optim_grads_params"
 
+    env_vars: null
+
   # See docs/design-docs/sequence-packing-and-dynamic-batching.md 
   # for more details on dynamic batching and sequence packing.
   dynamic_batching:

@@ -55,7 +55,10 @@ policy:
       lr_decay_iters: null
       lr_warmup_iters: 13
       lr_warmup_init: 3.0e-8
-
+
+    env_vars:
+      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
+
   generation:
     backend: "vllm"
     max_new_tokens: ${policy.max_total_sequence_length}

@@ -82,7 +82,6 @@ logger:
   tensorboard_enabled: true
   mlflow_enabled: false
   monitor_gpus: true
-  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
   wandb:
     project: nemo-rl
     name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1

@@ -20,7 +20,7 @@ checkpointing:
   metric_name: "val_loss"
   higher_is_better: false
   keep_top_k: 3
-  save_period: 10000
+  save_period: 50
   checkpoint_must_save_by: null
 
 policy:
@@ -29,14 +29,14 @@ policy:
     name: ${policy.model_name}
   train_global_batch_size: 256
   train_micro_batch_size: 1
-  max_total_sequence_length: 2048
+  max_total_sequence_length: 8192
   precision: "bfloat16"
   dtensor_cfg:
     enabled: true
     cpu_offload: False
     sequence_parallel: false
     activation_checkpointing: false
-    tensor_parallel_size: 1
+    tensor_parallel_size: 4
     context_parallel_size: 1
     custom_parallel_plan: null
 
@@ -82,10 +82,9 @@ logger:
   tensorboard_enabled: true
   mlflow_enabled: false
   monitor_gpus: true
-  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
   wandb:
     project: nemo-rl
-    name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
+    name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4
   tensorboard: {}
   gpu_monitoring:
     collection_interval: 10

@@ -20,7 +20,7 @@ checkpointing:
   metric_name: "val_loss"
   higher_is_better: false
   keep_top_k: 3
-  save_period: 10000
+  save_period: 50
   checkpoint_must_save_by: null
 
 policy:
@@ -29,7 +29,7 @@ policy:
     name: ${policy.model_name}
   train_global_batch_size: 256
   train_micro_batch_size: 1
-  max_total_sequence_length: 2048
+  max_total_sequence_length: 8192
   precision: "bfloat16"
   dtensor_cfg:
     enabled: false
@@ -49,7 +49,7 @@ policy:
     enabled: true
     empty_unused_memory_level: 1
     activation_checkpointing: false
-    tensor_model_parallel_size: 2
+    tensor_model_parallel_size: 4
     expert_tensor_parallel_size: 1
     expert_model_parallel_size: 1
     pipeline_model_parallel_size: 1
@@ -115,10 +115,9 @@ logger:
   tensorboard_enabled: true
   mlflow_enabled: false
   monitor_gpus: true
-  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
   wandb:
     project: nemo-rl
-    name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
+    name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron
   tensorboard: {}
   gpu_monitoring:
     collection_interval: 10

@@ -115,7 +115,6 @@ logger:
   tensorboard_enabled: true
   mlflow_enabled: false
   monitor_gpus: true
-  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
   wandb:
     project: nemo-rl
     name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1

@@ -83,7 +83,6 @@ logger:
   tensorboard_enabled: true
   mlflow_enabled: false
   monitor_gpus: true
-  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
   wandb:
     project: nemo-rl
     name: dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1

@@ -0,0 +1,159 @@
+grpo:  # GRPO algorithm settings
+  num_prompts_per_step: 32
+  num_generations_per_prompt: 16  # 32 x 16 = 512 = policy.train_global_batch_size
+  max_rollout_turns: 1
+  max_num_steps: 500
+  normalize_rewards: true
+  use_leave_one_out_baseline: true
+  val_period: 10
+  val_at_start: false
+  max_val_samples: 256
+  val_batch_size: 256  # same value as max_val_samples; presumably a single validation batch — confirm
+  seed: 42
+loss_fn:  # loss settings
+  reference_policy_kl_penalty: 0.01
+  ratio_clip_min: 0.2  # NOTE(review): min and max are both 0.2 — presumably clip offsets, not absolute bounds; confirm
+  ratio_clip_max: 0.2
+  ratio_clip_c: null
+  use_on_policy_kl_approximation: false
+  use_importance_sampling_correction: false
+  token_level_loss: true
+checkpointing:
+  enabled: false  # checkpointing is off; presumably the keys below only take effect when true — confirm
+  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
+  metric_name: val_reward
+  higher_is_better: true
+  keep_top_k: 3
+  save_period: 100  # a multiple of grpo.val_period (10)
+  checkpoint_must_save_by: null
+policy:
+  model_name: meta-llama/Llama-3.2-1B-Instruct
+  tokenizer:
+    name: meta-llama/Llama-3.2-1B-Instruct  # same value as policy.model_name
+  train_global_batch_size: 512  # equals grpo.num_prompts_per_step (32) x grpo.num_generations_per_prompt (16)
+  train_micro_batch_size: 4  # referenced by sequence_packing.train_mb_tokens
+  generation_batch_size: 32
+  logprob_batch_size: 4  # referenced by sequence_packing.logprob_mb_tokens
+  max_total_sequence_length: 512  # referenced by both sequence_packing *_mb_tokens values
+  precision: bfloat16  # referenced by megatron_cfg.pipeline_dtype and generation.vllm_cfg.precision
+  optimizer: null  # null here; an optimizer section is defined under megatron_cfg.optimizer
+  megatron_cfg:
+    enabled: true  # this section is enabled; dtensor_cfg.enabled is false in this file
+    empty_unused_memory_level: 0
+    activation_checkpointing: false
+    tensor_model_parallel_size: 1
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    context_parallel_size: 1
+    pipeline_dtype: ${policy.precision}
+    sequence_parallel: false
+    freeze_moe_router: true
+    moe_router_dtype: "fp64"
+    moe_router_load_balancing_type: "none"  # "seq_aux_loss" causes logprob error divergence for grpo
+    moe_router_bias_update_rate: 0.0  # by default, disable bias updates for grpo
+    # Gives ~20% training perf speedup with sequence packing.
+    apply_rope_fusion: true
+
+    optimizer:
+      optimizer: "adam"
+      lr: 5.0e-6
+      min_lr: 5.0e-7
+      weight_decay: 0.01
+      bf16: true
+      fp16: false
+      params_dtype: "float32"
+
+      # adam
+      adam_beta1: 0.9
+      adam_beta2: 0.999
+      adam_eps: 1.0e-8  # keep the decimal point: YAML 1.1 loaders (plain PyYAML) read a bare 1e-8 as a string
+
+      # sgd (listed although optimizer is "adam")
+      sgd_momentum: 0.9
+
+      # distributed optimizer
+      use_distributed_optimizer: true
+      use_precision_aware_optimizer: true
+
+      clip_grad: ${policy.max_grad_norm}  # policy.max_grad_norm is 1 in this file
+
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}  # same source as start_weight_decay
+      weight_decay_incr_style: "constant"
+      lr_decay_style: "constant"
+      lr_decay_iters: null
+      lr_warmup_iters: 50
+      lr_warmup_init: 5.0e-7  # same value as optimizer.min_lr
+
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      average_in_collective: true
+      use_custom_fsdp: false
+      data_parallel_sharding_strategy: "optim_grads_params"
+
+  dtensor_cfg:
+    enabled: false  # megatron_cfg.enabled is true in this file
+  dynamic_batching:
+    enabled: false
+  sequence_packing:
+    enabled: true
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}  # 512 and 4 here; 2048 if `mul` multiplies
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}  # 512 and 4 here; 2048 if `mul` multiplies
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+  make_sequence_length_divisible_by: 1
+  max_grad_norm: 1  # referenced by megatron_cfg.optimizer.clip_grad
+  generation:
+    backend: vllm  # backend-specific options are under vllm_cfg
+    max_new_tokens: 512  # same value as policy.max_total_sequence_length
+    temperature: 1
+    top_p: 1
+    top_k: null
+    stop_token_ids:
+      - 128009  # presumably the Llama 3 end-of-turn token id — confirm against the tokenizer
+    stop_strings: null
+    vllm_cfg:
+      async_engine: false
+      precision: ${policy.precision}
+      tensor_parallel_size: 1
+      pipeline_parallel_size: 1
+      gpu_memory_utilization: 0.6
+      max_model_len: 512  # same value as policy.max_total_sequence_length
+      enforce_eager: false
+    colocated:
+      enabled: true
+      resources:
+        gpus_per_node: null  # left unset; presumably only needed when colocated.enabled is false — confirm
+        num_nodes: null
+data:
+  max_input_seq_length: 512  # same value as policy.max_total_sequence_length
+  prompt_file: examples/prompts/cot.txt
+  system_prompt_file: null  # no system prompt file is set
+  dataset_name: OpenMathInstruct-2
+  shuffle: true
+env:
+  math:  # the only environment configured in this file
+    num_workers: 8
+logger:
+  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
+  num_val_samples_to_print: 0
+  wandb_enabled: true  # settings are in the `wandb` mapping of this section
+  tensorboard_enabled: true  # settings are in the `tensorboard` mapping of this section
+  mlflow_enabled: false  # no mlflow mapping is defined in this file
+  monitor_gpus: true  # settings are in the `gpu_monitoring` mapping of this section
+  wandb:
+    project: nemo-rl
+    name: grpo-llama3.2-1b-instruct-1n8g-megatron
+  tensorboard: {}  # empty mapping, no overrides given
+  gpu_monitoring:
+    collection_interval: 10
+    flush_interval: 10
+cluster:
+  gpus_per_node: 8  # 1 node x 8 GPUs; presumably what the "1n8g" tag in the run/log names refers to
+  num_nodes: 1