From 99d58473487e5468a96c4c6fc60e7214e67ce0fe Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 12 Aug 2025 09:27:15 -0700 Subject: [PATCH 01/12] add Megatron tests, improve some existing tests Signed-off-by: ashors1 --- ...o-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml} | 8 +- ...lama3.1-8b-instruct-4n8g-megatron.v2.yaml} | 8 +- ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 161 ++++++++++++ .../grpo-moonlight-16ba3b-4n8g-megatron.yaml | 169 ++++++++++++ ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 183 +++++++++++++ .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 156 +++++++++++ ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 135 ++++++++++ ...lama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 82 ++++++ ...> sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml} | 41 +-- ...l => sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml} | 31 ++- ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 127 +++++++++ ...aml => sft-llama3.1-8b-1n8g-megatron.yaml} | 41 +-- ... => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml} | 14 +- ...en2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml} | 13 +- examples/configs/sft_openmathinstruct2.yaml | 6 + tests/check_metrics.py | 29 ++- tests/functional/dpo_megatron.sh | 45 ++++ tests/functional/sft_megatron.sh | 45 ++++ ...a3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh | 3 +- ...dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh} | 3 +- ...-llama3.1-8b-instruct-4n8g-megatron.v2.sh} | 3 +- ...1-8b-instruct-4n8g-megatrontp2pp2-quick.sh | 3 +- ...o-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh | 3 +- .../llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh | 3 +- ...o-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh | 3 +- ...grpo-llama3.2-1b-instruct-1n8g-megatron.sh | 42 +++ .../grpo-moonlight-16ba3b-4n8g-megatron.sh | 41 +++ .../grpo-qwen2.5-7b-instruct-4n8g-megatron.sh | 41 +++ ...2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh | 3 +- .../llm/grpo-qwen3-30ba3b-8n8g-megatron.sh | 40 +++ ...-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh | 42 +++ ...llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh} | 13 +- .../llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh 
| 42 +++ ....sh => sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh} | 12 +- ... sft-llama3.1-8b-1n8g-megatron-seqpack.sh} | 8 +- .../llm/sft-llama3.1-8b-1n8g-megatron.sh | 39 +++ ...sh => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh} | 11 +- ...qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh} | 2 +- tests/test_suites/nightly.txt | 20 +- tests/test_suites/release.txt | 12 +- .../models/megatron/converters/test_common.py | 245 ++++++++++++++++++ 41 files changed, 1818 insertions(+), 110 deletions(-) rename examples/configs/recipes/llm/{dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml => dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml} (93%) rename examples/configs/recipes/llm/{dpo-llama3.1-8b-instruct-4n8g-megatron.yaml => dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml} (95%) create mode 100755 examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml create mode 100644 examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml create mode 100755 examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml create mode 100755 examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml rename examples/configs/recipes/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml => sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml} (66%) rename examples/configs/recipes/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml => sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml} (69%) create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml rename examples/configs/recipes/llm/{sft-llama3.1-8b-instruct-1n8g-megatron.yaml => sft-llama3.1-8b-1n8g-megatron.yaml} (74%) rename examples/configs/recipes/llm/{sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml} (87%) rename examples/configs/recipes/llm/{sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml 
=> sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml} (88%) create mode 100755 tests/functional/dpo_megatron.sh create mode 100755 tests/functional/sft_megatron.sh rename tests/test_suites/llm/{dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh => dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh} (91%) rename tests/test_suites/llm/{dpo-llama3.1-8b-instruct-4n8g-megatron.sh => dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh} (91%) create mode 100755 tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh create mode 100755 tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh create mode 100755 tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh create mode 100755 tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh rename tests/test_suites/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh => sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh} (81%) create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh rename tests/test_suites/llm/{sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh => sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh} (83%) rename tests/test_suites/llm/{sft-llama3.1-8b-instruct-1n8g-megatron.sh => sft-llama3.1-8b-1n8g-megatron-seqpack.sh} (87%) create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh rename tests/test_suites/llm/{sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh => sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh} (82%) rename tests/test_suites/llm/{sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh => sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh} (96%) create mode 100755 tests/unit/models/megatron/converters/test_common.py diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml similarity index 93% rename from examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml rename to 
examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml index 9cd7573ccf..01395f5247 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml @@ -20,7 +20,7 @@ checkpointing: metric_name: "val_loss" higher_is_better: false keep_top_k: 3 - save_period: 10000 + save_period: 50 checkpoint_must_save_by: null policy: @@ -29,14 +29,14 @@ policy: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 - max_total_sequence_length: 2048 + max_total_sequence_length: 8192 precision: "bfloat16" dtensor_cfg: enabled: true cpu_offload: False sequence_parallel: false activation_checkpointing: false - tensor_parallel_size: 1 + tensor_parallel_size: 4 context_parallel_size: 1 custom_parallel_plan: null @@ -85,7 +85,7 @@ logger: num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 + name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4 tensorboard: {} gpu_monitoring: collection_interval: 10 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml similarity index 95% rename from examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml rename to examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 55b473b4b6..49e8a78422 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -20,7 +20,7 @@ checkpointing: metric_name: "val_loss" higher_is_better: false keep_top_k: 3 - save_period: 10000 + save_period: 50 checkpoint_must_save_by: null policy: @@ -29,7 +29,7 @@ policy: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 - max_total_sequence_length: 
2048 + max_total_sequence_length: 8192 precision: "bfloat16" dtensor_cfg: enabled: false @@ -49,7 +49,7 @@ policy: enabled: true empty_unused_memory_level: 1 activation_checkpointing: false - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 4 expert_tensor_parallel_size: 1 expert_model_parallel_size: 1 pipeline_model_parallel_size: 1 @@ -118,7 +118,7 @@ logger: num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 + name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron tensorboard: {} gpu_monitoring: collection_interval: 10 diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml new file mode 100755 index 0000000000..55de4c59f4 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -0,0 +1,161 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 + seed: 42 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + ratio_clip_c: null + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + token_level_loss: true +checkpointing: + enabled: false + checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 100 + checkpoint_must_save_by: null +policy: + model_name: meta-llama/Llama-3.2-1B-Instruct + tokenizer: + name: meta-llama/Llama-3.2-1B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + 
activation_checkpointing_enabled: false + optimizer: null + megatron_cfg: + enabled: true + empty_unused_memory_level: 0 + activation_checkpointing: false + tensor_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 5.0e-6 + min_lr: 5.0e-7 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 5.0e-7 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + dtensor_cfg: + enabled: false + dynamic_batching: + enabled: False + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: 
"modified_first_fit_decreasing" + sequence_length_round: 64 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + async_engine: false + precision: ${policy.precision} + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + enforce_eager: False + colocated: + enabled: true + resources: + gpus_per_node: null + num_nodes: null +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 + shuffle: true +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-llama3.2-1b-instruct-1n8g-megatron + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml new file mode 100644 index 0000000000..42b365f351 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml @@ -0,0 +1,169 @@ +# GRPO Algorithm Configuration +defaults: "../../grpo_math_1B.yaml" + +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_num_steps: 1000000 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: -1 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 + +loss_fn: + reference_policy_kl_penalty: 0.04 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + # (default off) loss formulation improvements (docs/guides/grpo.md#loss) + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + 
token_level_loss: true + ratio_clip_c: null + +checkpointing: + enabled: false + checkpoint_dir: "results/grpo_megatron" + metric_name: "val_reward" + higher_is_better: true + keep_top_k: 3 + save_period: 10000 + +policy: + model_name: "moonshotai/Moonlight-16B-A3B-Instruct" + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 64 # Only used when generating using megatron backend + logprob_batch_size: 1 + max_total_sequence_length: 8192 + precision: "bfloat16" + + dtensor_cfg: + enabled: false + + # dynamic_batching improves performance by ensuring logprob and training microbatches + # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length + # responses are sorted by sequence length and bucketed into microbatches with a total + # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the + # training and logprob stages respectively. 
+ dynamic_batching: + enabled: False + + sequence_packing: + enabled: False # coming soon + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_ffd" + sequence_length_round: 64 + + max_grad_norm: 1.0 + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + + optimizer: null # remove default FSDP optimizer + + megatron_cfg: + enabled: true + empty_unused_memory_level: 0 + activation_checkpointing: false + converter_type: "Qwen2ForCausalLM" + tensor_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + # Causes logprob error divergence for moonlight + apply_rope_fusion: False + + optimizer: + optimizer: "adam" + lr: 1.0e-6 + min_lr: 5.0e-7 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + 
lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 5.0e-7 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + generation: + backend: "vllm" + max_new_tokens: ${policy.max_total_sequence_length} + temperature: 1.0 + top_p: 1.0 + top_k: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: ${policy.max_total_sequence_length} + +data: + max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len + prompt_file: "examples/prompts/cot.txt" + system_prompt_file: null + dataset_name: "OpenMathInstruct-2" + +env: + math: + num_workers: 8 + +logger: + log_dir: "logs" # Base directory for all logs + num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal + wandb_enabled: false + tensorboard_enabled: false + mlflow_enabled: False + monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "grpo-dev" + name: "grpo-moonlight-16B-A3B-Instruct" + tensorboard: {} + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml new file mode 100755 index 0000000000..d099931839 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -0,0 +1,183 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + 
val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 + seed: 42 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + ratio_clip_c: null + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + token_level_loss: true +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-megatron + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 100 + checkpoint_must_save_by: null +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: ${policy.model_name} + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: false + megatron_cfg: + enabled: true + empty_unused_memory_level: 0 + activation_checkpointing: false + converter_type: "Qwen2ForCausalLM" + tensor_model_parallel_size: 2 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 5.0e-6 + min_lr: 5.0e-7 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: 
${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 5.0e-7 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + dynamic_batching: + enabled: false + sequence_packing: + enabled: true + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + make_sequence_length_divisible_by: 4 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 13 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 13 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + async_engine: false + precision: ${policy.precision} + tensor_parallel_size: 4 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + enforce_eager: False + colocated: + enabled: true + resources: + gpus_per_node: null + num_nodes: null +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 + shuffle: true +env: + math: + num_workers: 8 +logger: + log_dir: 
logs/grpo-qwen2.5-7b-instruct-4n8g-megatron + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-megatron + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml new file mode 100755 index 0000000000..89e434d3dc --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -0,0 +1,156 @@ +# GRPO Algorithm Configuration +defaults: "../../grpo_math_1B.yaml" + +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_num_steps: 1000000 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + # (default off) loss formulation improvements (docs/guides/grpo.md#loss) + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false + token_level_loss: true + ratio_clip_c: null +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen3-30ba3b-8n8g-megatron + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 + checkpoint_must_save_by: null +policy: + model_name: "Qwen/Qwen3-30B-A3B" + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 # Only used when generating using HF backend + logprob_batch_size: 4 + max_total_sequence_length: 4096 + precision: "bfloat16" + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + + dtensor_cfg: + enabled: false + + optimizer: null # remove 
default FSDP optimizer + + scheduler: null # remove default FSDP scheduler + + dynamic_batching: + enabled: False + sequence_packing: + enabled: False # coming soon + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_ffd" + sequence_length_round: 64 + max_grad_norm: 1.0 + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + context_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + sequence_parallel: True + pipeline_dtype: ${policy.precision} + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 3.0e-7 + min_lr: 3.0e-8 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + clip_grad: ${policy.max_grad_norm} + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + #sgd + sgd_momentum: 0.9 + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 3.0e-8 + + env_vars: + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False" + + distributed_data_parallel_config: + 
grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + generation: + backend: "vllm" + max_new_tokens: ${policy.max_total_sequence_length} + temperature: 1.0 + top_p: 1.0 + top_k: null + stop_token_ids: null + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.7 + max_model_len: ${policy.max_total_sequence_length} +data: + max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len + prompt_file: "examples/prompts/cot.txt" + system_prompt_file: null + dataset_name: "OpenMathInstruct-2" +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen3-30ba3b-8n8g-megatron + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen3-30ba3b-8n8g-megatron + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 8 diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml new file mode 100644 index 0000000000..08b015512a --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -0,0 +1,135 @@ +sft: + max_num_epochs: 1 + max_num_steps: 1000000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 1 + val_at_start: false + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 100 + checkpoint_must_save_by: null +policy: + model_name: "meta-llama/Llama-3.1-70B" + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct ## specify if you'd like to use a 
tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + precision: "bfloat16" + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: false + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 4 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 2 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 2e-5 + min_lr: 2e-5 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: 0.0 + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 1 + lr_warmup_init: 2e-5 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + dynamic_batching: + enabled: false + sequence_packing: + enabled: false + # makes the training sequence length divisible by the tensor parallel 
size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + max_grad_norm: null + optimizer: + name: "torch.optim.AdamW" + kwargs: + lr: 2e-5 + weight_decay: 0.01 + betas: [0.9, 0.98] + eps: 1e-8 + # when using Dtensor, we need to set foreach + # and fused to False + foreach: False + fused: False +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + seed: 42 + shuffle: true +logger: + log_dir: "logs" # Base directory for all logs + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + tensorboard_enabled: true + mlflow_enabled: False + monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "sft-dev" + name: "openmathinstruct-nemorl-1M_train" + tensorboard: + log_dir: "tb_logs-openmathinstruct-nemorl-1M_train" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) +cluster: + gpus_per_node: 8 + num_nodes: 8 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml new file mode 100644 index 0000000000..b04201ac9f --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -0,0 +1,82 @@ +sft: + max_num_epochs: 1 + max_num_steps: 10000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 2 + val_at_start: false + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + metric_name: val_loss + higher_is_better: false + 
keep_top_k: 3 + save_period: 50 + checkpoint_must_save_by: null +policy: + model_name: meta-llama/Llama-3.1-8B + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 4 + context_parallel_size: 1 + custom_parallel_plan: null + dynamic_batching: + enabled: true + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 + sequence_packing: + enabled: false + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + eps: 1e-08 + foreach: false + fused: false +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + seed: 42 + shuffle: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: false + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml similarity index 66% rename from examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml rename to 
examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index 409c21670b..20adece59d 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -1,11 +1,11 @@ sft: max_num_epochs: 1 - max_num_steps: 2730 - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true + max_num_steps: 10000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 2 + val_at_start: false seed: 42 checkpointing: enabled: true @@ -13,16 +13,15 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.1-8B-Instruct + model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 - max_total_sequence_length: 1024 + train_global_batch_size: 512 + train_micro_batch_size: 2 + max_total_sequence_length: 4096 precision: bfloat16 dtensor_cfg: enabled: true @@ -34,6 +33,8 @@ policy: custom_parallel_plan: null dynamic_batching: enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 sequence_packing: enabled: false train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} @@ -44,21 +45,25 @@ policy: optimizer: name: torch.optim.AdamW kwargs: - lr: 5e-06 - weight_decay: 0.1 + lr: 2e-5 + weight_decay: 0.01 betas: - 0.9 - 0.98 - 
eps: 1e-05 + eps: 1e-08 foreach: false fused: false data: - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long wandb_enabled: true diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml similarity index 69% rename from examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml rename to examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index 1ac548c354..e9abb0771f 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -1,7 +1,7 @@ sft: max_num_epochs: 1 max_num_steps: 350 - val_period: 10 + val_period: 500 val_batches: 8 val_global_batch_size: 32 val_micro_batch_size: 1 @@ -13,16 +13,15 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 20 checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.1-8B-Instruct + model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 - max_total_sequence_length: 1024 + train_global_batch_size: 512 + 
train_micro_batch_size: 2 + max_total_sequence_length: 4096 precision: bfloat16 dtensor_cfg: enabled: true @@ -44,21 +43,25 @@ policy: optimizer: name: torch.optim.AdamW kwargs: - lr: 5e-06 - weight_decay: 0.1 + lr: 2e-5 + weight_decay: 0.01 betas: - 0.9 - 0.98 - eps: 1e-05 + eps: 1e-08 foreach: false fused: false data: - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp wandb_enabled: true @@ -70,7 +73,7 @@ logger: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml new file mode 100644 index 0000000000..c2791b3074 --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -0,0 +1,127 @@ +sft: + max_num_epochs: 1 + max_num_steps: 250 + val_period: 500 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 50 + checkpoint_must_save_by: null +policy: + model_name: meta-llama/Llama-3.1-8B + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + 
dtensor_cfg: + enabled: false + dynamic_batching: + enabled: false + sequence_packing: + enabled: true + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + max_grad_norm: 1 + optimizer: null + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 2 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 2 + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + sequence_parallel: false + freeze_moe_router: false + moe_router_dtype: null + moe_router_load_balancing_type: "aux_loss" + moe_router_bias_update_rate: 1e-3 + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: 2.0e-5 + min_lr: 1.99999e-5 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1e-5 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 50 + lr_warmup_init: 1.9999e-5 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + data_parallel_sharding_strategy: "optim_grads_params" + + +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2"
+ prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + seed: 42 + shuffle: true +logger: + log_dir: logs/sft-llama3.1-8b-1n8g-megatron + wandb_enabled: true + tensorboard_enabled: true + mlflow_enabled: false + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-1n8g-megatron + tensorboard: + log_dir: tb_logs-sft-dev-openmathinstruct2 + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml similarity index 74% rename from examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml rename to examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 5e7bc8b8d7..28331f4c69 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -1,28 +1,27 @@ sft: max_num_epochs: 1 max_num_steps: 250 - val_period: 10 + val_period: 500 val_batches: 8 val_global_batch_size: 32 val_micro_batch_size: 1 val_at_start: true seed: 42 checkpointing: - enabled: false #true - checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp1 + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.1-8B-Instruct + model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == 
''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 + train_global_batch_size: 512 train_micro_batch_size: 2 - max_total_sequence_length: 1024 + max_total_sequence_length: 4096 precision: bfloat16 dtensor_cfg: enabled: false @@ -58,10 +57,10 @@ policy: optimizer: optimizer: "adam" - lr: 5.0e-6 - min_lr: 4.9999e-6 - weight_decay: 0.1 - bf16: false + lr: 2.0e-5 + min_lr: 1.99999e-5 + weight_decay: 0.01 + bf16: true fp16: false params_dtype: "float32" @@ -86,7 +85,7 @@ policy: lr_decay_style: "constant" lr_decay_iters: null lr_warmup_iters: 50 - lr_warmup_init: 4.9999e-6 + lr_warmup_init: 1.9999e-5 distributed_data_parallel_config: grad_reduce_in_fp32: false @@ -97,14 +96,18 @@ policy: data: - add_generation_prompt: false - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: - log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp1 + log_dir: logs/sft-llama3.1-8b-1n8g-megatron wandb_enabled: true tensorboard_enabled: true mlflow_enabled: false @@ -112,9 +115,9 @@ logger: num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl - name: sft-llama3.1-8b-instruct-1n8g-fsdp1 + name: sft-llama3.1-8b-1n8g-megatron tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml similarity index 87% rename from examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml rename to examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml index
6a5eb97f6f..de2e6f9eee 100644 --- a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml @@ -13,7 +13,7 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.2-1B @@ -53,12 +53,16 @@ policy: foreach: false fused: false data: - max_input_seq_length: 1024 - dataset_name: squad + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true + seed: 42 logger: log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1 wandb_enabled: true @@ -70,7 +74,7 @@ logger: project: nemo-rl name: sft-llama3.2-1b-1n8g-fsdp2tp1 tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml similarity index 88% rename from examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml rename to examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml index 516cda4bbb..85f4e7f111 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -13,7 +13,7 @@ checkpointing: metric_name: val_loss higher_is_better: false keep_top_k: 3 - save_period: 10 + save_period: 100 checkpoint_must_save_by: null policy: model_name: Qwen/Qwen2.5-32B @@ -53,11 +53,14 @@ policy: foreach: false fused: false data: - max_input_seq_length: 16000 - dataset_name: squad + 
max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" add_bos: true add_eos: true - add_generation_prompt: false + add_generation_prompt: true + output_key: 'generated_solution' shuffle: true logger: log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt @@ -70,7 +73,7 @@ logger: project: nemo-rl name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt tensorboard: - log_dir: tb_logs-sft-dev-squad + log_dir: tb_logs-sft-dev-openmathinstruct2 gpu_monitoring: collection_interval: 10 flush_interval: 10 diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index 640911bc0f..bbea9f1767 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -36,11 +36,17 @@ policy: context_parallel_size: 1 custom_parallel_plan: null + megatron_cfg: + enabled: false + dynamic_batching: enabled: false sequence_packing: enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 # makes the training sequence length divisible by the tensor parallel size # this is useful for sequence parallel training diff --git a/tests/check_metrics.py b/tests/check_metrics.py index a48c2f4875..df97d22a1f 100644 --- a/tests/check_metrics.py +++ b/tests/check_metrics.py @@ -31,9 +31,32 @@ def max(value): return __builtins__.max(float(v) for v in value.values()) -def mean(value): - """Return the mean of values in a dictionary.""" - return statistics.mean(float(v) for v in value.values()) +def mean(value, range_start=1, range_end=0): + """Return the mean of values (or a range of values) in a dictionary. + + Note: + step, and ranges, are 1 indexed. Range_end is exclusive. 
+ range_end=0 means to include until the last step in the run + """ + + ## find potential offset that might arise from resuming from a checkpoint + max_step_reached = __builtins__.max([int(s) for s in value.keys()]) + ## this is the number of steps that occurred prior to resuming + offset = max_step_reached - len(value) + + num_elem = len(value) + if range_start < 0: + range_start += num_elem + 1 + offset + if range_end <= 0: + range_end += num_elem + 1 + offset + + vals = [] + for step, v in value.items(): + if range_start <= int(step) and int(step) < range_end: + vals.append(float(v)) + + print(vals) + return statistics.mean(vals) def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]: diff --git a/tests/functional/dpo_megatron.sh b/tests/functional/dpo_megatron.sh new file mode 100755 index 0000000000..8c1524c2c5 --- /dev/null +++ b/tests/functional/dpo_megatron.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# clean up checkpoint directory on exit +trap "rm -rf /tmp/sft_checkpoints" EXIT + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetches metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +set -eou pipefail + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR + +cd $PROJECT_ROOT +uv run $PROJECT_ROOT/examples/run_dpo.py \ + --config $PROJECT_ROOT/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml \ + policy.model_name=Qwen/Qwen3-0.6B \ + cluster.gpus_per_node=2 \ + dpo.max_num_steps=3 \ + dpo.val_batches=1 \ + dpo.val_period=3 \ + logger.tensorboard_enabled=true \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=false \ + logger.monitor_gpus=true \ + checkpointing.enabled=false \ + policy.megatron_cfg.tensor_model_parallel_size=1 \ + policy.train_global_batch_size=8 \ + $@ \ + 2>&1 | tee $RUN_LOG + +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["3"] < 5' \ + diff --git a/tests/functional/sft_megatron.sh b/tests/functional/sft_megatron.sh new file mode 100755 index 0000000000..dfb7fcfdba --- /dev/null +++ b/tests/functional/sft_megatron.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# clean up checkpoint directory on exit +trap "rm -rf /tmp/sft_checkpoints" EXIT + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetches metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +set -eou pipefail + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR + +cd $PROJECT_ROOT +uv run $PROJECT_ROOT/examples/run_sft.py \ + --config $PROJECT_ROOT/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml \ + policy.model_name=Qwen/Qwen3-0.6B \ + policy.tokenizer.name=Qwen/Qwen3-0.6B \ + cluster.gpus_per_node=2 \ + sft.max_num_steps=3 \ + sft.val_batches=1 \ + sft.val_period=3 \ + logger.tensorboard_enabled=true \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=false \ + logger.monitor_gpus=true \ + checkpointing.enabled=false \ + policy.megatron_cfg.pipeline_model_parallel_size=1 \ + $@ \ + 2>&1 | tee $RUN_LOG + +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["3"] < 0.8' \ + diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh index f5b29b7db7..0162bd8bb9 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["20"] < 3.4' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["20"] < 0.6' + 'data["train/preference_loss"]["20"] < 0.6' \ + 'mean(data["timing/train/total_step_time"], -10, -1) < 7.8' fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh 
b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh similarity index 91% rename from tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh rename to tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh index e9ccb1e147..df74127ba2 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["150"] < 3.0' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["150"] < 0.4' + 'data["train/preference_loss"]["150"] < 0.4' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 24' fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh similarity index 91% rename from tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh rename to tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh index e9ccb1e147..8701d63d1f 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["150"] < 3.0' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["150"] < 0.4' + 'data["train/preference_loss"]["150"] < 0.4' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 11.5' fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh index f5b29b7db7..0bc8e13e28 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh +++ 
b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh @@ -38,5 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["20"] < 3.4' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ - 'data["train/preference_loss"]["20"] < 0.6' + 'data["train/preference_loss"]["20"] < 0.6' \ + 'mean(data["timing/train/total_step_time"], -10) < 6.7' fi diff --git a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh index 6606099df7..48691c0df4 100755 --- a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh @@ -36,5 +36,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] > 0.69314' \ 'data["train/loss"]["1"] < 0.69316' \ - 'data["train/loss"]["150"] < 0.55' + 'data["train/loss"]["150"] < 0.55' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 1.3' fi diff --git a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh index 94781e4931..4624b7282d 100755 --- a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh +++ b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh @@ -35,5 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - "data[\"train/token_mult_prob_error\"][\"${MAX_STEPS}\"] < 1.1" + "data[\"train/token_mult_prob_error\"][\"${MAX_STEPS}\"] < 1.1" \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 14' fi diff --git 
a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh index 45cfad6e83..3661370fa6 100755 --- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh @@ -35,6 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["500"] < 1.1' + 'data["train/token_mult_prob_error"]["500"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 10' fi diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh new file mode 100755 index 0000000000..83071c70e3 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh @@ -0,0 +1,42 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path 
$JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["500"] < 1.1' \ + 'data["train/reward"]["500"] > 0.1' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 10.5' + +fi diff --git a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh new file mode 100755 index 0000000000..7288252eec --- /dev/null +++ b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=150 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +PYTHONPATH=$HF_HOME/modules:$PYTHONPATH uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 
'data["train/token_mult_prob_error"]["30"] < 1.1' \ + 'mean(data["train/reward"]) > 0.45' \ + 'mean(data["timing/train/total_step_time"], -11, -1) < 70' +fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh new file mode 100755 index 0000000000..45f354043a --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' \ + 'mean(data["train/reward"]) > 0.56' \ + 'mean(data["timing/train/total_step_time"], 2) < 50' +fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh index 98df00c25c..0a31e74590 100755 --- 
a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh @@ -35,6 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["450"] < 1.1' + 'data["train/token_mult_prob_error"]["450"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], 2) < 25' fi diff --git a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh new file mode 100755 index 0000000000..f89041cd40 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + 
uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' \ + 'data["train/reward"]["30"] > 0.43' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 220' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh new file mode 100755 index 0000000000..718322e33a --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh @@ -0,0 +1,42 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=300 +MAX_STEPS=300 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA-NeMo/RL/issues/263 +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 0.55' \ + 'data["train/loss"]["300"] < 0.285' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'mean(data["timing/train/total_step_time"], 2) < 20' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh similarity index 81% rename from tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh rename to tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh index b22c00dec0..76c600c648 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh @@ -2,11 +2,10 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env -# TODO: @ashors real convergence run (dataset only has 2737) # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=2730 -MAX_STEPS=2730 +STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=120 # ===== END CONFIG ===== @@ -35,9 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA-NeMo/RL/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["2730"] < 0.3' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 50' + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'mean(data["timing/train/total_step_time"], 2) < 10' fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..90fd03467c --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh @@ -0,0 +1,42 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=250 +MAX_STEPS=250 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA-NeMo/RL/issues/263 +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 80' \ + 'mean(data["timing/train/total_step_time"], 2) < 22' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh similarity index 83% rename from tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh rename to tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh index abed80e5ed..8f69d0f0b8 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh @@ -4,8 +4,8 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=350 -MAX_STEPS=350 +STEPS_PER_RUN=50 +MAX_STEPS=50 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=45 # ===== END CONFIG ===== @@ -35,9 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["350"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 45' + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["50"] < 0.38' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'mean(data["timing/train/total_step_time"], 2) < 32' fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh 
b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh similarity index 87% rename from tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh rename to tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh index cf72bd9377..fe54af1fbd 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh @@ -31,9 +31,9 @@ uv run examples/run_sft.py \ # Convert tensorboard logs to json uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -# TODO: @ashors tighter bounds if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2' \ - 'data["train/loss"]["250"] < 0.3' -fi \ No newline at end of file + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'mean(data["timing/train/total_step_time"], 2) < 6' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh new file mode 100755 index 0000000000..bc5eae73a2 --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=250 +MAX_STEPS=250 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + 
checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 0.6' \ + 'data["train/loss"]["250"] < 0.36' \ + 'mean(data["timing/train/total_step_time"], 2) < 20' +fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh similarity index 82% rename from tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh rename to tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh index 32c66dae04..a4b44bd1f1 100755 --- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh @@ -4,8 +4,8 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 +STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 # ===== END CONFIG ===== @@ -34,8 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["500"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 25' + 'data["train/loss"]["1"] < 0.82' \ + 'data["train/loss"]["250"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 25' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 0.6' fi diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh 
b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh similarity index 96% rename from tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh rename to tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh index 257add6fc5..d16a3d8d98 100755 --- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh @@ -37,7 +37,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 1.5' \ + 'data["train/loss"]["1"] < 0.37' \ 'data["train/loss"]["20"] < 0.3' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 35' fi diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 07c3eb5b9c..def1d87d94 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -10,29 +10,40 @@ tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh # Dtensor (Qwen/Qwen2.5-7B-Instruct) tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.sh +# Megatron +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh + # Functional 32b run tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.sh +# Functional moonlight run +tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh + # Deepscaler (short tests) tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh ####### # SFT # ####### # 1N 1B/8B runs -tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v2.sh +tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh # Dtensor 
(8B) -tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh +tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.sh +# dynamic batching +tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh # Functional 32b test -tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh +tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh # Megatron -tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-megatron.sh +tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh +# sequence packing +tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh ####### # DPO # diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index e339ef0bc1..e58f5b4d71 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -11,17 +11,23 @@ tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.sh # Long Gemma3 27b run tests/test_suites/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.sh +# Long Megatron Qwen3 30B-A3B run +tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh + ####### # SFT # ####### # Long 8b convergence -tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh +tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh + +# 300 step 70b convergence +tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh ####### # DPO # ####### # Long 8b convergence -tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh -tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh +tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh +tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh diff --git a/tests/unit/models/megatron/converters/test_common.py b/tests/unit/models/megatron/converters/test_common.py new file mode 100755 index 0000000000..d98f1fe905 --- /dev/null +++ b/tests/unit/models/megatron/converters/test_common.py @@ -0,0 +1,245 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import Mock, patch + +import torch + +from nemo_rl.models.megatron.converters.common import ( + get_global_expert_num, + get_global_key_from_local_key, + get_global_layer_num, + get_local_expert_num, + get_local_layer_num, + split_fc1_etp, + split_fc1_tp, + split_qkv_bias_gpu, + split_qkv_gpu, + update_transforms_for_nemorl, +) + + +class TestLayerNumberFunctions: + """Test functions related to layer number extraction and conversion.""" + + def test_get_local_layer_num_valid(self): + """Test get_local_layer_num with valid layer keys.""" + assert get_local_layer_num("layers.5.attention.weight") == 5 + assert get_local_layer_num("decoder.layers.10.mlp.weight") == 10 + assert get_local_layer_num("model.layers.0.self_attn.weight") == 0 + + def test_get_local_layer_num_invalid(self): + """Test get_local_layer_num with invalid layer keys.""" + assert get_local_layer_num("attention.weight") is None + assert get_local_layer_num("layers.abc.weight") is None + assert get_local_layer_num("layers.") is None + + def test_get_global_layer_num_pp(self): + """Test get_global_layer_num with simple pipeline configuration.""" + mock_cfg = Mock() + mock_cfg.num_layers = 10 + mock_cfg.num_layers_in_first_pipeline_stage = 4 + mock_cfg.num_layers_in_last_pipeline_stage = 3 + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + 
mock_ps.get_pipeline_model_parallel_rank.return_value = 1 + mock_ps.get_pipeline_model_parallel_world_size.return_value = 3 + + result = get_global_layer_num("layers.2.weight", mock_cfg) + assert result == 6 + + +class TestExpertNumberFunctions: + """Test functions related to expert number extraction and conversion.""" + + def test_get_local_expert_num_valid(self): + """Test get_local_expert_num with valid expert keys.""" + assert get_local_expert_num("layers.0.mlp.experts.weight2") == 2 + assert get_local_expert_num("decoder.layers.1.experts.weight5") == 5 + assert get_local_expert_num("model.layers.0.experts.weight0") == 0 + + def test_get_local_expert_num_invalid(self): + """Test get_local_expert_num with invalid expert keys.""" + assert get_local_expert_num("layers.0.mlp.weight") is None + assert get_local_expert_num("layers.0.mlp.experts.2._extra_state") is None + + def test_get_global_expert_num(self): + """Test get_global_expert_num with expert parallel configuration.""" + mock_cfg = Mock() + mock_cfg.num_moe_experts = 8 + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + mock_ps.get_expert_model_parallel_rank.return_value = 1 + mock_ps.get_expert_model_parallel_world_size.return_value = 2 + + result = get_global_expert_num("layers.0.mlp.experts.weight2", mock_cfg) + assert result == 6 # 8 // 2 + 2 + + +class TestKeyConversionFunctions: + """Test functions related to key conversion between local and global.""" + + def test_get_global_key_from_local_key_layer_only(self): + """Test key conversion with only layer numbers.""" + mock_cfg = Mock() + mock_cfg.num_layers = 12 + mock_cfg.num_layers_in_first_pipeline_stage = None + mock_cfg.num_layers_in_last_pipeline_stage = None + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + mock_ps.get_pipeline_model_parallel_rank.return_value = 1 + mock_ps.get_pipeline_model_parallel_world_size.return_value = 2 + + result = 
get_global_key_from_local_key( + "layers.3.attention.weight", mock_cfg + ) + assert result == "layers.9.attention.weight" + + def test_get_global_key_from_local_key_expert_and_layer(self): + """Test key conversion with only expert numbers.""" + mock_cfg = Mock() + mock_cfg.num_moe_experts = 8 + mock_cfg.num_layers = 12 + mock_cfg.num_layers_in_first_pipeline_stage = None + mock_cfg.num_layers_in_last_pipeline_stage = None + + with patch( + "nemo_rl.models.megatron.converters.common.parallel_state" + ) as mock_ps: + mock_ps.get_expert_model_parallel_rank.return_value = 1 + mock_ps.get_expert_model_parallel_world_size.return_value = 2 + + mock_ps.get_pipeline_model_parallel_rank.return_value = 1 + mock_ps.get_pipeline_model_parallel_world_size.return_value = 3 + + result = get_global_key_from_local_key( + "layers.0.mlp.experts.weight2", mock_cfg + ) + assert result == "layers.4.mlp.experts.weight6" + + +class TestTensorSplittingFunctions: + """Test functions related to tensor splitting operations.""" + + def test_split_fc1_tp(self): + """Test split_fc1_tp function.""" + mock_ctx = Mock() + mock_ctx.source.config.tensor_model_parallel_size = 2 + + # Create a tensor with shape (4, 10) representing 2 TP ranks with 2 components each + linear_fc1 = torch.randn(4, 10) + + gate_proj, up_proj = split_fc1_tp(mock_ctx, linear_fc1) + + assert gate_proj.shape == (2, 10) + assert up_proj.shape == (2, 10) + assert torch.allclose(gate_proj, linear_fc1[::2]) + assert torch.allclose(up_proj, linear_fc1[1::2]) + + def test_split_fc1_etp(self): + """Test split_fc1_etp function.""" + mock_ctx = Mock() + mock_ctx.source.config.expert_tensor_parallel_size = 2 + + # Create a tensor with shape (4, 10) representing 2 ETP ranks with 2 components each + linear_fc1 = torch.randn(4, 10) + + gate_proj, up_proj = split_fc1_etp(mock_ctx, linear_fc1) + + assert gate_proj.shape == (2, 10) + assert up_proj.shape == (2, 10) + assert torch.allclose(gate_proj, linear_fc1[::2]) + assert 
torch.allclose(up_proj, linear_fc1[1::2]) + + def test_split_qkv_gpu(self): + """Test split_qkv_gpu function.""" + mock_ctx = Mock() + mock_ctx.source.config.num_attention_heads = 8 + mock_ctx.source.config.num_query_groups = 2 + mock_ctx.source.config.kv_channels = 16 + + # Create QKV tensor: (heads + 2*groups) * head_size * hidden_size + qkv_total_dim = 8 + 2 * 2 # 12 + linear_qkv = torch.randn(qkv_total_dim, 16, 64) + + q_proj, k_proj, v_proj = split_qkv_gpu(mock_ctx, linear_qkv) + + # Q should have 8 heads * 16 channels = 128 + assert q_proj.shape == (128, 64) + # K and V should have 2 groups * 16 channels = 32 each + assert k_proj.shape == (32, 64) + assert v_proj.shape == (32, 64) + + def test_split_qkv_bias_gpu(self): + """Test split_qkv_bias_gpu function.""" + mock_ctx = Mock() + mock_ctx.source.config.num_attention_heads = 8 + mock_ctx.source.config.num_query_groups = 2 + mock_ctx.source.config.kv_channels = 16 + + # Create QKV bias tensor: (heads + 2*groups) * head_size + qkv_total_dim = 8 + 2 * 2 # 12 + qkv_bias = torch.randn(qkv_total_dim, 16) + + q_bias, k_bias, v_bias = split_qkv_bias_gpu(mock_ctx, qkv_bias) + + # Q should have 8 heads * 16 channels = 128 + assert q_bias.shape == (128,) + # K and V should have 2 groups * 16 channels = 32 each + assert k_bias.shape == (32,) + assert v_bias.shape == (32,) + + +class TestTransformUpdateFunctions: + """Test functions related to transform updates.""" + + def test_update_transforms_for_nemorl(self): + """Test update_transforms_for_nemorl function.""" + # Create mock transforms + mock_transform1 = Mock() + mock_transform1.transform.__name__ = "split_fc1" + mock_transform1.source_key = "layers.0.mlp.experts.0.linear_fc1.weight" + + mock_transform2 = Mock() + mock_transform2.transform.__name__ = "split_fc1" + mock_transform2.source_key = "layers.0.mlp.shared_experts.linear_fc1.weight" + + mock_transform3 = Mock() + mock_transform3.transform.__name__ = "split_qkv" + + mock_transform4 = Mock() + 
mock_transform4.transform.__name__ = "split_qkv_bias" + + transforms = [ + mock_transform1, + mock_transform2, + mock_transform3, + mock_transform4, + ] + + updated_transforms = update_transforms_for_nemorl(transforms) + + # Check that expert transforms use split_fc1_etp + assert updated_transforms[0].transform == split_fc1_etp + # Check that non-expert transforms use split_fc1_tp + assert updated_transforms[1].transform == split_fc1_tp + # Check that qkv transforms are updated + assert updated_transforms[2].transform == split_qkv_gpu + assert updated_transforms[3].transform == split_qkv_bias_gpu From 3fe289816aa6591671758d67e22d73671953265a Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 12 Aug 2025 14:55:56 -0700 Subject: [PATCH 02/12] rename unit test file to avoid name conflict Signed-off-by: ashors1 --- .../models/megatron/{test_common.py => test_megatron_common.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/models/megatron/{test_common.py => test_megatron_common.py} (100%) diff --git a/tests/unit/models/megatron/test_common.py b/tests/unit/models/megatron/test_megatron_common.py similarity index 100% rename from tests/unit/models/megatron/test_common.py rename to tests/unit/models/megatron/test_megatron_common.py From 473f408aac38eab3e9557109c8aaf49c8c3c4af7 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 13 Aug 2025 10:35:16 -0700 Subject: [PATCH 03/12] rename tests Signed-off-by: ashors1 --- .../converters/{test_common.py => test_converters_common.py} | 0 .../models/megatron/{test_megatron_common.py => test_common.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/unit/models/megatron/converters/{test_common.py => test_converters_common.py} (100%) rename tests/unit/models/megatron/{test_megatron_common.py => test_common.py} (100%) diff --git a/tests/unit/models/megatron/converters/test_common.py b/tests/unit/models/megatron/converters/test_converters_common.py similarity index 100% rename from 
tests/unit/models/megatron/converters/test_common.py rename to tests/unit/models/megatron/converters/test_converters_common.py diff --git a/tests/unit/models/megatron/test_megatron_common.py b/tests/unit/models/megatron/test_common.py similarity index 100% rename from tests/unit/models/megatron/test_megatron_common.py rename to tests/unit/models/megatron/test_common.py From 23966de8955938b0a9d868134afa71d6a256818f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 09:03:17 -0700 Subject: [PATCH 04/12] fix unit tests Signed-off-by: ashors1 --- .../unit/models/megatron/converters/test_converters_common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/models/megatron/converters/test_converters_common.py b/tests/unit/models/megatron/converters/test_converters_common.py index d98f1fe905..1177f53af1 100755 --- a/tests/unit/models/megatron/converters/test_converters_common.py +++ b/tests/unit/models/megatron/converters/test_converters_common.py @@ -14,6 +14,7 @@ from unittest.mock import Mock, patch +import pytest import torch from nemo_rl.models.megatron.converters.common import ( @@ -29,6 +30,9 @@ update_transforms_for_nemorl, ) +# Apply mcore marker to all tests in this module +pytestmark = pytest.mark.mcore + class TestLayerNumberFunctions: """Test functions related to layer number extraction and conversion.""" From 1725226e50757b39bf07b3a0c085788d8ad8a286 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 15:17:10 -0700 Subject: [PATCH 05/12] remove debug code Signed-off-by: ashors1 --- tests/check_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/check_metrics.py b/tests/check_metrics.py index df97d22a1f..bc9f6ced04 100644 --- a/tests/check_metrics.py +++ b/tests/check_metrics.py @@ -55,7 +55,6 @@ def mean(value, range_start=1, range_end=0): if range_start <= int(step) and int(step) < range_end: vals.append(float(v)) - print(vals) return statistics.mean(vals) From 3a4d9a8e1c54f8749d4fcb7af5f79c971ea6d39f Mon 
Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 15:23:58 -0700 Subject: [PATCH 06/12] fix import error Signed-off-by: ashors1 --- .../converters/test_converters_common.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/unit/models/megatron/converters/test_converters_common.py b/tests/unit/models/megatron/converters/test_converters_common.py index 1177f53af1..c8731eb573 100755 --- a/tests/unit/models/megatron/converters/test_converters_common.py +++ b/tests/unit/models/megatron/converters/test_converters_common.py @@ -17,18 +17,21 @@ import pytest import torch -from nemo_rl.models.megatron.converters.common import ( - get_global_expert_num, - get_global_key_from_local_key, - get_global_layer_num, - get_local_expert_num, - get_local_layer_num, - split_fc1_etp, - split_fc1_tp, - split_qkv_bias_gpu, - split_qkv_gpu, - update_transforms_for_nemorl, -) +try: + from nemo_rl.models.megatron.converters.common import ( + get_global_expert_num, + get_global_key_from_local_key, + get_global_layer_num, + get_local_expert_num, + get_local_layer_num, + split_fc1_etp, + split_fc1_tp, + split_qkv_bias_gpu, + split_qkv_gpu, + update_transforms_for_nemorl, + ) +except ImportError: + pass # Apply mcore marker to all tests in this module pytestmark = pytest.mark.mcore From ad3f1d540211978f0a24e8f5a7bf2db28b69d7dd Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 14 Aug 2025 22:00:06 -0700 Subject: [PATCH 07/12] fix config keys Signed-off-by: ashors1 --- examples/configs/dpo.yaml | 1 - ....1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml | 1 - ...po-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml | 1 - ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 1 - ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 1 - ...llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml | 1 - ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 2 - ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 2 - .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 2 - 
...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 4 +- ...lama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 2 - .../sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml | 1 - .../llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml | 1 - ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 2 - .../llm/sft-llama3.1-8b-1n8g-megatron.yaml | 1 - .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml | 1 - ...wen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml | 1 - examples/configs/rm.yaml | 2 - examples/configs/sft.yaml | 1 - examples/configs/sft_openmathinstruct2.yaml | 1 - .../sft_openmathinstruct2_megatron.yaml | 149 ++++++++++++++++++ nemo_rl/data/__init__.py | 1 + nemo_rl/utils/logger.py | 1 - 23 files changed, 151 insertions(+), 29 deletions(-) create mode 100644 examples/configs/sft_openmathinstruct2_megatron.yaml diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index ecc159b484..74a74efc21 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -158,7 +158,6 @@ logger: tensorboard_enabled: false mlflow_enabled: false # Disable MLflow logging monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: "dpo-dev" name: "dpo" diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml index b7040abb37..72dcb9ad1e 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml @@ -82,7 +82,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml 
b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml index 01395f5247..22851b368c 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml @@ -82,7 +82,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 49e8a78422..1960502a09 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -115,7 +115,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 37d20bbcdb..987e70dc88 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -115,7 +115,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml index 
1e35e24d4e..22870f0e66 100644 --- a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml @@ -83,7 +83,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml index 55de4c59f4..8f17d32819 100755 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -36,8 +36,6 @@ policy: logprob_batch_size: 4 max_total_sequence_length: 512 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false optimizer: null megatron_cfg: enabled: true diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml index d099931839..13689e6ddc 100755 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -36,8 +36,6 @@ policy: logprob_batch_size: 2 max_total_sequence_length: 4096 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false megatron_cfg: diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 89e434d3dc..048ed32782 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -38,8 +38,6 @@ policy: logprob_batch_size: 4 max_total_sequence_length: 4096 
precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index 08b015512a..cd5751f523 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -23,8 +23,6 @@ policy: train_micro_batch_size: 1 max_total_sequence_length: 4096 precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false megatron_cfg: @@ -114,8 +112,8 @@ data: add_eos: true add_generation_prompt: true output_key: 'generated_solution' - seed: 42 shuffle: true + seed: 42 logger: log_dir: "logs" # Base directory for all logs wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml index b04201ac9f..d7906b82e0 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -23,8 +23,6 @@ policy: train_micro_batch_size: 2 max_total_sequence_length: 4096 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: true cpu_offload: false diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index 20adece59d..1fc0ccec7c 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -70,7 +70,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false 
monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index e9abb0771f..8c3f14b531 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -68,7 +68,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index c2791b3074..4ad9355446 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -23,8 +23,6 @@ policy: train_micro_batch_size: 2 max_total_sequence_length: 4096 precision: bfloat16 - fsdp_offload_enabled: false - activation_checkpointing_enabled: false dtensor_cfg: enabled: false dynamic_batching: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 28331f4c69..e5e86dd302 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -112,7 +112,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.1-8b-1n8g-megatron diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml 
b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml index de2e6f9eee..165e2fa9a3 100644 --- a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml @@ -69,7 +69,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-llama3.2-1b-1n8g-fsdp2tp1 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml index 85f4e7f111..800d94711e 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -68,7 +68,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: nemo-rl name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml index 20d4cf6a18..4adffdc5d7 100644 --- a/examples/configs/rm.yaml +++ b/examples/configs/rm.yaml @@ -31,8 +31,6 @@ policy: train_micro_batch_size: 1 max_total_sequence_length: 8192 precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false reward_model_cfg: enabled: true # loads model as a Reward Model (do not change) diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 01864d7691..cd7232527c 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -141,7 +141,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: "sft-dev" name: 
"sft-dev-${data.dataset_name}" diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index bbea9f1767..09354a2039 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -82,7 +82,6 @@ logger: tensorboard_enabled: true mlflow_enabled: false monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb: project: "sft-dev" name: "openmathinstruct-nemorl-1M_train" diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml new file mode 100644 index 0000000000..17b7ddeaee --- /dev/null +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -0,0 +1,149 @@ +# SFT Algorithm Configuration +defaults: sft_openmathinstruct2.yaml + +sft: + max_num_epochs: 1 + max_num_steps: 1000000 + val_period: 500 + val_batches: 4 + val_global_batch_size: 128 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 + +checkpointing: + enabled: true + checkpoint_dir: "results/sft_openmathinstruct2" + metric_name: "val_loss" + higher_is_better: false + keep_top_k: 100 + save_period: 500 + +policy: + model_name: "meta-llama/Llama-3.1-8B" + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + precision: "bfloat16" + + dtensor_cfg: + enabled: false + + megatron_cfg: + activation_checkpointing: false + context_parallel_size: 1 + distributed_data_parallel_config: + average_in_collective: true + data_parallel_sharding_strategy: optim_grads_params + grad_reduce_in_fp32: true + overlap_grad_reduce: true + overlap_param_gather: true + empty_unused_memory_level: 1 + enabled: true + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + 
num_layers_in_last_pipeline_stage: null + optimizer: + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1.0e-8 + bf16: true + clip_grad: 0 + fp16: false + lr: 0.00002 + min_lr: 0.00002 + optimizer: adam + params_dtype: bfloat16 + sgd_momentum: 0.9 + use_distributed_optimizer: true + use_precision_aware_optimizer: false #true ## TODO: precision aware optim not working with fp8. Is this expected? + weight_decay: 0.01 + + ## recently introduced, our current mcore commit doesn't have this + #fp8_recipe: delayed + + pipeline_dtype: bfloat16 + pipeline_model_parallel_size: 1 + scheduler: + end_weight_decay: 0.01 + lr_decay_iters: null + lr_decay_style: constant + lr_warmup_init: 0.00001999999 + lr_warmup_iters: 1 + start_weight_decay: 0.01 + weight_decay_incr_style: constant + sequence_parallel: false + tensor_model_parallel_size: 4 ## TODO: should not need this large TP size + + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + env_vars: + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False" + + fp8_cfg: + enabled: true + fp8: hybrid + fp8_recipe: delayed + fp8_param: true # false gives the following error: "RuntimeError: /TransformerEngine/transformer_engine/common/gemm/cublaslt_gemm.cu:116 in function CanonicalizeGemmInput: Assertion failed: !is_fp8_dtype(ret.Atype). 
Input A is missing column-wise usage" + fp8_dot_product_attention: false #true + fp8_multi_head_attention: false #true + + dynamic_batching: + enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 + + + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${mul:16, ${policy.megatron_cfg.tensor_model_parallel_size}} + max_grad_norm: null + + optimizer: null + +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + +logger: + log_dir: "logs" # Base directory for all logs + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + tensorboard_enabled: true + mlflow_enabled: false + monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "sft-openmathinstruct-megatron" + name: "llama8b" + tensorboard: + log_dir: "tb_logs-openmathinstruct-nemorl-1M_train" + mlflow: + experiment_name: "sft-dev" + run_name: "openmathinstruct-nemorl-1M_train" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + gpus_per_node: 8 + num_nodes: 2 + diff --git a/nemo_rl/data/__init__.py b/nemo_rl/data/__init__.py index df14a1546f..257a2db66f 100644 --- a/nemo_rl/data/__init__.py +++ b/nemo_rl/data/__init__.py @@ -29,6 +29,7 @@ class DataConfig(TypedDict): 
add_system_prompt: NotRequired[bool] split: NotRequired[str] shuffle: NotRequired[bool] + seed: NotRequired[int] class MathDataConfig(DataConfig): diff --git a/nemo_rl/utils/logger.py b/nemo_rl/utils/logger.py index 4cf2621cd4..e70af2117f 100644 --- a/nemo_rl/utils/logger.py +++ b/nemo_rl/utils/logger.py @@ -76,7 +76,6 @@ class LoggerConfig(TypedDict): mlflow: NotRequired[MLflowConfig] monitor_gpus: bool gpu_monitoring: GPUMonitoringConfig - num_val_samples_to_print: int class LoggerInterface(ABC): From 59c896b39a1bdfaa1581e29bb0c80733a29ba680 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 20 Aug 2025 08:19:13 -0700 Subject: [PATCH 08/12] fix conflict Signed-off-by: ashors1 --- tests/test_suites/nightly.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 9abed7b02e..e4c488bf6f 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -28,7 +28,6 @@ tests/test_suites/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1. 
tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh ->>>>>>> 2b87def7971f01d6060a5dfc3b9e2df58f832922 # GRPO math test run (32K context mcore) tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh From 30328425c44d69841e63121fa8496c8656ee0928 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 20 Aug 2025 11:56:48 -0700 Subject: [PATCH 09/12] fix unit test Signed-off-by: ashors1 --- examples/configs/grpo_math_1B.yaml | 2 ++ examples/configs/sft.yaml | 10 +++++++++- tests/test_suites/release.txt | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index b797afee17..384041607e 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -114,6 +114,8 @@ policy: use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" + env_vars: null + # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. 
dynamic_batching: diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index cd7232527c..2319568475 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -42,6 +42,8 @@ policy: dynamic_batching: enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 sequence_packing: enabled: False @@ -125,7 +127,7 @@ policy: overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" - + use_custom_fsdp: false data: max_input_seq_length: ${policy.max_total_sequence_length} @@ -135,6 +137,12 @@ data: add_generation_prompt: false shuffle: true + ## unused with squad dataset + prompt_file: null + split: null + output_key: null + seed: null + logger: log_dir: "logs" # Base directory for all logs wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 2afd3ccafe..bd117a83f8 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -12,7 +12,7 @@ tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.sh tests/test_suites/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.sh # Long Megatron Qwen3 30B-A3B run -tests/test_suites/llm/grpo-qwen3-30ba3b-16n8g-megatron.sh +tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh ####### # SFT # From fd723ea13bd5b6b26d0ca655b2fa225e3015f308 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 21 Aug 2025 09:22:27 -0700 Subject: [PATCH 10/12] fix empty env vars Signed-off-by: ashors1 --- examples/configs/grpo_math_1B.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 384041607e..2d22a94ffe 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -114,7 +114,7 @@ policy: use_custom_fsdp: false 
data_parallel_sharding_strategy: "optim_grads_params" - env_vars: null + env_vars: {} # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. From 31d832a2b211ec07c9d0cfac89eebc6637fea924 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 21 Aug 2025 15:13:23 -0700 Subject: [PATCH 11/12] re-disable expandable segments for qwen30ba3b Signed-off-by: ashors1 --- examples/configs/grpo_math_qwen30ba3b_megatron.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml index 3040d20ffc..1a0cc651c7 100644 --- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml +++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml @@ -55,7 +55,10 @@ policy: lr_decay_iters: null lr_warmup_iters: 13 lr_warmup_init: 3.0e-8 - + + env_vars: + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False" + generation: backend: "vllm" max_new_tokens: ${policy.max_total_sequence_length} From 84b460b34ffa7260c95b8af7a7be601cb410435f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 22 Aug 2025 12:28:49 -0700 Subject: [PATCH 12/12] fix null env vars Signed-off-by: ashors1 --- examples/configs/grpo_math_1B.yaml | 2 +- nemo_rl/models/policy/lm_policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 2d22a94ffe..384041607e 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -114,7 +114,7 @@ policy: use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" - env_vars: {} + env_vars: null # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. 
diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index c853e200d0..754faea5b7 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -133,7 +133,7 @@ def __init__( name_prefix=name_prefix, workers_per_node=workers_per_node, sharding_annotations=self.sharding_annotations, - env_vars=env_vars, + env_vars=env_vars or {}, ) if config["dynamic_batching"]["enabled"]: