Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,36 @@ repos:
require_serial: true
additional_dependencies: []
minimum_pre_commit_version: "2.9.2"

# The rationale behind this pre-commit hook is that we want to ensure the config is minimized and matches
# what you want to merge in early; otherwise, you risk running one experiment, but when you merge the config
Comment thread
terrykong marked this conversation as resolved.
Outdated
# into upstream, you'll merge with the base config and that could be an experiment with different hyperparameters.
# Anecdotally, this has been an issue when an SFT recipe runs without a custom chat_template, but when it merges with
# the default one, it gets our recommended chat_template, which is not what comes from the config.
#
# You can disable this pre-commit hook if you find this disruptive, but we will expect that the config
# is minimized before accepting the recipe upstream.
Comment thread
yfw marked this conversation as resolved.
Outdated
- repo: local
  hooks:
    # Runs `./tools/config_cli.py minimize-check <base> <recipe>` for every LLM recipe
    # matching the globs below, pairing each recipe family with its base config.
    - id: configs-minimize-check-llm
      name: minimize-check llm recipes
      language: system
      # The script globs the recipe directories itself, so staged filenames are not passed.
      pass_filenames: false
      entry: bash
      args:
        - -lc
        - |
          set -euo pipefail
          # check BASE RECIPE...: minimize-check each existing RECIPE against BASE.
          # An explicit `if` (rather than `[ -e "$f" ] && cmd`) keeps the exit status 0
          # when a glob matches nothing, so an empty recipe family cannot fail the hook.
          # A failing minimize-check still aborts the script through `set -e`.
          check() {
            local base="$1"; shift
            local f
            for f in "$@"; do
              if [ -e "$f" ]; then
                ./tools/config_cli.py minimize-check "$base" "$f"
              fi
            done
          }
          check examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-*.yaml
          check examples/configs/grpo_math_1B.yaml examples/configs/recipes/llm/grpo-*.yaml
          check examples/configs/sft.yaml examples/configs/recipes/llm/sft-*.yaml
    # Same check for the VLM recipes, against the VLM GRPO base config.
    - id: configs-minimize-check-vlm
      name: minimize-check vlm recipes
      language: system
      # The script globs the recipe directory itself, so staged filenames are not passed.
      pass_filenames: false
      entry: bash
      args:
        - -lc
        - |
          set -euo pipefail
          # An unmatched glob stays a literal pattern in bash; the `if` skips it and
          # leaves the exit status at 0 instead of failing the hook.
          base="examples/configs/vlm_grpo_3B.yaml"
          for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do
            if [ -e "$f" ]; then
              ./tools/config_cli.py minimize-check "$base" "$f"
            fi
          done
Comment thread
terrykong marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -1,95 +1,44 @@
defaults: ../../dpo.yaml
Comment thread
terrykong marked this conversation as resolved.
dpo:
max_num_epochs: 2
max_num_steps: 20
val_period: 50
val_batches: 16
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: false
seed: 42

reference_policy_kl_penalty: 0.05
preference_average_log_probs: False
sft_average_log_probs: ${.preference_average_log_probs}
preference_loss_weight: 1
sft_loss_weight: 0.01

checkpointing:
enabled: true
checkpoint_dir: "results/dpo"
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 10000
checkpoint_must_save_by: null

policy:
model_name: "meta-llama/Llama-3.1-8B-Instruct"
model_name: meta-llama/Llama-3.1-8B-Instruct
tokenizer:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 2048
precision: "bfloat16"
dtensor_cfg:
enabled: true
cpu_offload: False
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 2
context_parallel_size: 1
custom_parallel_plan: null

dynamic_batching:
enabled: false

sequence_packing:
enabled: false

make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 1.0

optimizer:
name: "torch.optim.AdamW"
kwargs:
lr: 5.0e-6
weight_decay: 0.1
betas: [0.9, 0.98]
eps: 1e-8
foreach: False
fused: False

eps: 1.0e-08
scheduler:
- name: "torch.optim.lr_scheduler.LinearLR"
kwargs:
start_factor: 0.000000001
end_factor: 1.0
total_iters: 1
- name: "torch.optim.lr_scheduler.ConstantLR"
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones: [1]

data:
dataset_name: "HelpSteer3"
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true

- name: torch.optim.lr_scheduler.LinearLR
kwargs:
start_factor: 1.0e-09
end_factor: 1.0
total_iters: 1
- name: torch.optim.lr_scheduler.ConstantLR
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones:
- 1
logger:
log_dir: "logs"
wandb_enabled: true
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
tensorboard: {}
gpu_monitoring:
collection_interval: 10
flush_interval: 10

cluster:
gpus_per_node: 8
num_nodes: 4
Original file line number Diff line number Diff line change
@@ -1,95 +1,40 @@
defaults: ../../dpo.yaml
Comment thread
terrykong marked this conversation as resolved.
dpo:
max_num_epochs: 1
max_num_steps: 150
val_period: 50
val_batches: 16
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: false
seed: 42

reference_policy_kl_penalty: 0.05
preference_average_log_probs: False
sft_average_log_probs: ${.preference_average_log_probs}
preference_loss_weight: 1
sft_loss_weight: 0.01

checkpointing:
enabled: true
checkpoint_dir: "results/dpo"
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 50
checkpoint_must_save_by: null

policy:
model_name: "meta-llama/Llama-3.1-8B-Instruct"
model_name: meta-llama/Llama-3.1-8B-Instruct
tokenizer:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 8192
precision: "bfloat16"
dtensor_cfg:
enabled: true
cpu_offload: False
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 4
context_parallel_size: 1
custom_parallel_plan: null

dynamic_batching:
enabled: false

sequence_packing:
enabled: false

make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 1.0

optimizer:
name: "torch.optim.AdamW"
kwargs:
lr: 5.0e-6
weight_decay: 0.1
betas: [0.9, 0.98]
eps: 1e-8
foreach: False
fused: False

eps: 1.0e-08
scheduler:
- name: "torch.optim.lr_scheduler.LinearLR"
kwargs:
start_factor: 0.000000001
end_factor: 1.0
total_iters: 1
- name: "torch.optim.lr_scheduler.ConstantLR"
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones: [1]

data:
dataset_name: "HelpSteer3"
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true

- name: torch.optim.lr_scheduler.LinearLR
kwargs:
start_factor: 1.0e-09
end_factor: 1.0
total_iters: 1
- name: torch.optim.lr_scheduler.ConstantLR
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones:
- 1
logger:
log_dir: "logs"
wandb_enabled: true
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4
tensorboard: {}
gpu_monitoring:
collection_interval: 10
flush_interval: 10

cluster:
gpus_per_node: 8
num_nodes: 4
Original file line number Diff line number Diff line change
@@ -1,128 +1,32 @@
defaults: ../../dpo.yaml
dpo:
max_num_epochs: 1
max_num_steps: 150
val_period: 50
val_batches: 16
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: false
seed: 42

reference_policy_kl_penalty: 0.05
preference_average_log_probs: False
sft_average_log_probs: ${.preference_average_log_probs}
preference_loss_weight: 1
sft_loss_weight: 0.01

checkpointing:
enabled: false #true
checkpoint_dir: "results/dpo"
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 50
checkpoint_must_save_by: null

enabled: false
policy:
model_name: "meta-llama/Llama-3.1-8B-Instruct"
model_name: meta-llama/Llama-3.1-8B-Instruct
tokenizer:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 8192
precision: "bfloat16"
dtensor_cfg:
enabled: false

dynamic_batching:
enabled: false

sequence_packing:
enabled: false

make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
max_grad_norm: 1.0

optimizer: null

megatron_cfg:
enabled: true
empty_unused_memory_level: 1
activation_checkpointing: false
tensor_model_parallel_size: 4
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
sequence_parallel: true
freeze_moe_router: false
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

optimizer:
optimizer: "adam"
lr: 5.0e-6 #4.0e-5
min_lr: 5.0e-6 #4.0e-5
weight_decay: 0.1
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_warmup_iters: 1
lr_warmup_init: 0.00000001

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
average_in_collective: true
data_parallel_sharding_strategy: "optim_grads_params"

data:
dataset_name: "HelpSteer3"
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true

logger:
log_dir: "logs"
wandb_enabled: true
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron
Comment thread
terrykong marked this conversation as resolved.
Outdated
tensorboard: {}
gpu_monitoring:
collection_interval: 10
flush_interval: 10

cluster:
gpus_per_node: 8
num_nodes: 4
Loading
Loading