Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,38 @@ repos:
require_serial: true
additional_dependencies: []
minimum_pre_commit_version: "2.9.2"

# This pre-commit hook ensures that every recipe config is minimized (i.e., it spells out only what
# differs from its base config) and therefore reflects exactly what you intend to merge. Without it,
# you might run experiments with one config, but after merging upstream the recipe could silently
# fall back to the base defaults, resulting in different hyperparameters.
#
# For example, we've seen cases where an SFT recipe is run without a custom chat_template. Once
# merged, it unexpectedly picks up the default recommended chat_template from upstream, which
# doesn't match the original experiment setup.
#
# If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
# is accepted upstream, we expect its config to be minimized.
- repo: local
  hooks:
  # Runs `config_cli.py minimize-check` on every LLM recipe (DPO, GRPO, SFT)
  # against the base config of its algorithm.
  - id: configs-minimize-check-llm
    name: minimize-check llm recipes
    language: system
    # The script globs the recipe files itself, so staged filenames are not
    # forwarded to it.
    pass_filenames: false
    entry: bash
    args:
    - -lc
    - |
      set -euo pipefail
      # Usage: check_recipes BASE RECIPE...
      check_recipes() {
        local base="$1"
        shift
        local recipe
        for recipe in "$@"; do
          # An unmatched glob arrives here as the literal pattern. Skip it with
          # an explicit `if` (not `[ -e ] && ...`) so a missing recipe does not
          # leave a non-zero status behind as the script's exit code.
          if [ -e "$recipe" ]; then
            ./tools/config_cli.py minimize-check "$base" "$recipe"
          fi
        done
      }
      check_recipes "examples/configs/dpo.yaml" examples/configs/recipes/llm/dpo-*.yaml
      check_recipes "examples/configs/grpo_math_1B.yaml" examples/configs/recipes/llm/grpo-*.yaml
      check_recipes "examples/configs/sft.yaml" examples/configs/recipes/llm/sft-*.yaml
  # Same check for the VLM GRPO recipes.
  - id: configs-minimize-check-vlm
    name: minimize-check vlm recipes
    language: system
    pass_filenames: false
    entry: bash
    args:
    - -lc
    - |
      set -euo pipefail
      # Usage: check_recipes BASE RECIPE...
      check_recipes() {
        local base="$1"
        shift
        local recipe
        for recipe in "$@"; do
          # An unmatched glob arrives here as the literal pattern. Skip it with
          # an explicit `if` (not `[ -e ] && ...`) so a missing recipe does not
          # leave a non-zero status behind as the script's exit code.
          if [ -e "$recipe" ]; then
            ./tools/config_cli.py minimize-check "$base" "$recipe"
          fi
        done
      }
      check_recipes "examples/configs/vlm_grpo_3B.yaml" examples/configs/recipes/vlm/vlm_grpo-*.yaml
Original file line number Diff line number Diff line change
@@ -1,95 +1,44 @@
defaults: ../../dpo.yaml
dpo:
max_num_epochs: 2
max_num_steps: 20
val_period: 50
val_batches: 16
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: false
seed: 42

reference_policy_kl_penalty: 0.05
preference_average_log_probs: False
sft_average_log_probs: ${.preference_average_log_probs}
preference_loss_weight: 1
sft_loss_weight: 0.01

checkpointing:
enabled: true
checkpoint_dir: "results/dpo"
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 10000
checkpoint_must_save_by: null

policy:
model_name: "meta-llama/Llama-3.1-8B-Instruct"
model_name: meta-llama/Llama-3.1-8B-Instruct
tokenizer:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 2048
precision: "bfloat16"
dtensor_cfg:
enabled: true
cpu_offload: False
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 2
context_parallel_size: 1
custom_parallel_plan: null

dynamic_batching:
enabled: false

sequence_packing:
enabled: false

make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 1.0

optimizer:
name: "torch.optim.AdamW"
kwargs:
lr: 5.0e-6
weight_decay: 0.1
betas: [0.9, 0.98]
eps: 1e-8
foreach: False
fused: False

eps: 1.0e-08
scheduler:
- name: "torch.optim.lr_scheduler.LinearLR"
kwargs:
start_factor: 0.000000001
end_factor: 1.0
total_iters: 1
- name: "torch.optim.lr_scheduler.ConstantLR"
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones: [1]

data:
dataset_name: "HelpSteer3"
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true

- name: torch.optim.lr_scheduler.LinearLR
kwargs:
start_factor: 1.0e-09
end_factor: 1.0
total_iters: 1
- name: torch.optim.lr_scheduler.ConstantLR
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones:
- 1
logger:
log_dir: "logs"
wandb_enabled: true
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
tensorboard: {}
gpu_monitoring:
collection_interval: 10
flush_interval: 10

cluster:
gpus_per_node: 8
num_nodes: 4
Original file line number Diff line number Diff line change
@@ -1,95 +1,40 @@
defaults: ../../dpo.yaml
dpo:
max_num_epochs: 1
max_num_steps: 150
val_period: 50
val_batches: 16
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: false
seed: 42

reference_policy_kl_penalty: 0.05
preference_average_log_probs: False
sft_average_log_probs: ${.preference_average_log_probs}
preference_loss_weight: 1
sft_loss_weight: 0.01

checkpointing:
enabled: true
checkpoint_dir: "results/dpo"
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 50
checkpoint_must_save_by: null

policy:
model_name: "meta-llama/Llama-3.1-8B-Instruct"
model_name: meta-llama/Llama-3.1-8B-Instruct
tokenizer:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 8192
precision: "bfloat16"
dtensor_cfg:
enabled: true
cpu_offload: False
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 4
context_parallel_size: 1
custom_parallel_plan: null

dynamic_batching:
enabled: false

sequence_packing:
enabled: false

make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 1.0

optimizer:
name: "torch.optim.AdamW"
kwargs:
lr: 5.0e-6
weight_decay: 0.1
betas: [0.9, 0.98]
eps: 1e-8
foreach: False
fused: False

eps: 1.0e-08
scheduler:
- name: "torch.optim.lr_scheduler.LinearLR"
kwargs:
start_factor: 0.000000001
end_factor: 1.0
total_iters: 1
- name: "torch.optim.lr_scheduler.ConstantLR"
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones: [1]

data:
dataset_name: "HelpSteer3"
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true

- name: torch.optim.lr_scheduler.LinearLR
kwargs:
start_factor: 1.0e-09
end_factor: 1.0
total_iters: 1
- name: torch.optim.lr_scheduler.ConstantLR
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones:
- 1
logger:
log_dir: "logs"
wandb_enabled: true
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4
tensorboard: {}
gpu_monitoring:
collection_interval: 10
flush_interval: 10

cluster:
gpus_per_node: 8
num_nodes: 4
Original file line number Diff line number Diff line change
@@ -1,128 +1,32 @@
defaults: ../../dpo.yaml
dpo:
max_num_epochs: 1
max_num_steps: 150
val_period: 50
val_batches: 16
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: false
seed: 42

reference_policy_kl_penalty: 0.05
preference_average_log_probs: False
sft_average_log_probs: ${.preference_average_log_probs}
preference_loss_weight: 1
sft_loss_weight: 0.01

checkpointing:
enabled: false #true
checkpoint_dir: "results/dpo"
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 50
checkpoint_must_save_by: null

enabled: false
policy:
model_name: "meta-llama/Llama-3.1-8B-Instruct"
model_name: meta-llama/Llama-3.1-8B-Instruct
tokenizer:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 8192
precision: "bfloat16"
dtensor_cfg:
enabled: false

dynamic_batching:
enabled: false

sequence_packing:
enabled: false

make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
max_grad_norm: 1.0

optimizer: null

megatron_cfg:
enabled: true
empty_unused_memory_level: 1
activation_checkpointing: false
tensor_model_parallel_size: 4
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
sequence_parallel: true
freeze_moe_router: false
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

optimizer:
optimizer: "adam"
lr: 5.0e-6 #4.0e-5
min_lr: 5.0e-6 #4.0e-5
weight_decay: 0.1
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_warmup_iters: 1
lr_warmup_init: 0.00000001

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
average_in_collective: true
data_parallel_sharding_strategy: "optim_grads_params"

data:
dataset_name: "HelpSteer3"
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true

logger:
log_dir: "logs"
wandb_enabled: true
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron
tensorboard: {}
gpu_monitoring:
collection_interval: 10
flush_interval: 10

name: dpo-llama3.1-8b-instruct-4n8g-megatron-tp4.v2
cluster:
gpus_per_node: 8
num_nodes: 4
Loading
Loading