# examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
# GRPO training configuration for the NeMo-Gym workplace-assistant task
# using the NVIDIA Nemotron Nano v2 9B model.
# Core GRPO algorithm settings.
grpo:
  max_num_epochs: 1
  num_prompts_per_step: 8
  num_generations_per_prompt: 4
  max_rollout_turns: 1 # for multi-turn rollouts. Workplace assistant has 1 turn but can have up to 6 tool-calling steps
  max_num_steps: 72
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 6
  val_at_start: true
  overlong_filtering: false
  max_val_samples: null # inferred from size of val dataset. for multi evals, repeat val ds via `num_repeats` in `ng_prepare_data`.
  val_batch_size: null
  seed: 42
  use_dynamic_sampling: false
  dynamic_sampling_max_gen_batches: 10
  batch_multiplier: 1
  reward_shaping:
    enabled: false
    overlong_buffer_length: 128
    overlong_buffer_penalty: 1
    max_response_length: ${policy.max_total_sequence_length}
  reward_scaling:
    enabled: false
    source_min: 0.0
    source_max: 1.0
    target_min: 0.0
    target_max: 1.0
  skip_reference_policy_logprobs_calculation: true

# GRPO loss-function configuration (clipped-ratio objective with optional KL penalty).
loss_fn:
  reference_policy_kl_penalty: 0
  reference_policy_kl_type: "k3"
  kl_input_clamp_value: 20.0
  kl_output_clamp_value: 10.0
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
  use_on_policy_kl_approximation: false
  truncated_importance_sampling_ratio: null
  use_importance_sampling_correction: false
  token_level_loss: true

# Checkpointing: keep the top-k checkpoints ranked by validation accuracy.
checkpointing:
  enabled: true
  checkpoint_dir: "results/grpo-workplace-assistant-nemotron-nano-v2-9b"
  metric_name: "val:accuracy"
  higher_is_better: true
  keep_top_k: 3
  save_period: 6 # Save checkpoint every 6 steps (aligned with val_period)
  checkpoint_must_save_by: null

# Policy model: training backend (Megatron), batching, and vLLM generation.
policy:
  model_name: "nvidia/NVIDIA-Nemotron-Nano-9B-v2"
  tokenizer:
    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
  hf_config_overrides: {}
  train_global_batch_size: ${mul:${grpo.num_prompts_per_step}, ${grpo.num_generations_per_prompt}} # Match the total rollouts per step
  train_micro_batch_size: 1
  logprob_batch_size: 1
  generation_batch_size: 32 # Only used when generating using HF backend
  max_total_sequence_length: 28672
  precision: "bfloat16"
  logprob_chunk_size: null # Disabled to allow defer_fp32_logits: false

  # DTensor training backend (disabled; Megatron is used instead, see below).
  dtensor_cfg:
    _v2: false
    enabled: false
    cpu_offload: false
    sequence_parallel: false
    activation_checkpointing: true
    tensor_parallel_size: 2
    context_parallel_size: 1
    custom_parallel_plan: null
    clear_cache_every_n_steps: null

  megatron_cfg:
    enabled: true
    bias_activation_fusion: false
    tensor_model_parallel_size: 8
    # We might want to consider setting this value higher (e.g. to 1) and raising the vllm generation max mem utilization
    empty_unused_memory_level: 0
    activation_checkpointing: true
    # train_iters needs to be large enough to cover all training steps
    train_iters: 100000
    expert_tensor_parallel_size: 1
    expert_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    sequence_parallel: false
    freeze_moe_router: true
    moe_router_dtype: "fp64"
    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
    # gives ~20% training perf speedup with sequence packing
    apply_rope_fusion: true
    defer_fp32_logits: false
    moe_permute_fusion: false

    optimizer:
      optimizer: "adam"
      lr: 5.0e-6
      min_lr: 5.0e-7
      weight_decay: 0.01
      bf16: true
      fp16: false
      params_dtype: "float32"

      # adam
      adam_beta1: 0.9
      adam_beta2: 0.999
      adam_eps: 1e-8

      # sgd
      sgd_momentum: 0.9

      # distributed optimizer
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true

      # optimizer cpu offload
      optimizer_cpu_offload: false
      optimizer_offload_fraction: 0.0

      clip_grad: ${policy.max_grad_norm}

    scheduler:
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: 100000 # Must be greater than lr_warmup_iters
      lr_warmup_iters: 13
      lr_warmup_init: 5.0e-7
      # override_opt_param_scheduler: true

    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: true
      use_custom_fsdp: false
      data_parallel_sharding_strategy: "optim_grads_params"

    env_vars: null

  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
  # for more details on dynamic batching and sequence packing.
  dynamic_batching:
    enabled: false
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    sequence_length_round: 64

  sequence_packing:
    enabled: false
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64

  # makes the training sequence length divisible by the tensor parallel size
  # this is useful for sequence parallel training
  make_sequence_length_divisible_by: 1
  max_grad_norm: 1.0
  offload_optimizer_for_logprob: false # Only useful for non-colocated generation since colocated generation will always offload optimizer to cuda before refit

  # Non-Megatron (torch) optimizer/scheduler; unused when megatron_cfg is enabled.
  optimizer: null

  scheduler:
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: []

  generation:
    backend: "vllm"
    max_new_tokens: ${policy.max_total_sequence_length}
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: true
      precision: ${policy.precision}
      # Must match megatron_cfg.tensor_model_parallel_size
      tensor_parallel_size: 8
      pipeline_parallel_size: 1
      enable_expert_parallel: false
      expert_parallel_size: 1
      gpu_memory_utilization: 0.7
      max_model_len: ${policy.max_total_sequence_length}
      enforce_eager: false
      use_deep_gemm: false
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      kv_cache_dtype: "auto"
      expose_http_server: true
      skip_tokenizer_init: false
      http_server_serving_chat_kwargs:
        # Workplace assistant uses 26 tools, so we enable auto_tools.
        # For Nemotron Nano v2, we use the dedicated `nemotron_json` tool parser,
        # registered via `nemotron_toolcall_parser_no_streaming.py`.
        enable_auto_tools: true
        tool_parser: nemotron_json
      vllm_kwargs:
        compilation_config:
          # when enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy,
          # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
          # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
          use_inductor: false
        mamba_ssm_cache_dtype: "float32"
    colocated:
      # true: generation shares training GPUs
      # false: uses dedicated generation resources
      enabled: true
      # only relevant when enabled is false
      resources:
        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
        num_nodes: null # Decides number of nodes to be dedicated to generation

# Dataset paths and loading options.
data:
  # Using the prepared train and validation datasets (downloaded from HuggingFace and split 90/10)
  # Train: 1129 samples, Validation: 126 samples
  train_jsonl_fpath: 3rdparty/Gym-workspace/Gym/resources_servers/workplace_assistant/data/train.jsonl
  validation_jsonl_fpath: 3rdparty/Gym-workspace/Gym/resources_servers/workplace_assistant/data/validation.jsonl
  agent_name: workplace_assistant_simple_agent
  shuffle: true
  num_workers: 0

# NeMo-Gym environment configuration.
env:
  should_use_nemo_gym: true
  should_log_nemo_gym_responses: true # If you have low logging storage, set this to false
  nemo_gym: # This is passed into NeMo-Gym as the initial_global_config_dict
    config_paths:
      - responses_api_models/vllm_model/configs/vllm_model_for_training.yaml # Required! And it must be *for_training
      - resources_servers/workplace_assistant/configs/workplace_assistant.yaml
    workplace_assistant_simple_agent:
      responses_api_agents:
        simple_agent:
          max_steps: 6 # Workplace assistant allows up to 6 tool-calling steps per task

# Logging backends (wandb enabled; tensorboard/mlflow/swanlab disabled).
logger:
  log_dir: "logs/grpo-workplace-assistant-nemotron-nano-v2-9b" # Base directory for all logs
  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
  wandb_enabled: true
  tensorboard_enabled: false
  mlflow_enabled: false # Disable MLflow logging
  swanlab_enabled: false
  monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
  wandb:
    project: "grpo-workplace-assistant"
    name: "nemotron-nano-v2-9b-workplace-assistant"
  tensorboard: {}
  mlflow:
    experiment_name: "grpo-workplace-assistant"
    run_name: "nemotron-nano-v2-9b-workplace-assistant"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

# Compute cluster topology.
cluster:
  gpus_per_node: 8
  num_nodes: 1 # Single node by default; set to 2+ for multi-node training