diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index db6fb7fa6d..25f5b59ef2 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -98,7 +98,9 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: true - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 #4.0e-5 diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 237fbb0df1..e9bcbf20b8 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -79,7 +79,9 @@ policy: context_parallel_size: 1 pipeline_dtype: ${policy.precision} sequence_parallel: false - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml index 1fd336d0b4..5b2b073691 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml @@ -55,7 +55,9 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: true - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 #4.0e-5 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 73008f3154..9388e8ed6f 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -55,7 +55,9 @@ policy: num_layers_in_first_pipeline_stage: 
null num_layers_in_last_pipeline_stage: null sequence_parallel: true - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 #4.0e-5 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml index ddd53920e6..68f0d177cd 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml @@ -43,7 +43,9 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: false - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index e3c614e2a7..4127411af6 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -73,6 +73,8 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: false + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true optimizer: optimizer: "adam" diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 9113723af0..5cee6fcaec 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -466,6 +466,7 @@ def __init__( "a lambda and couldn't be serialized). This is based on this check " "https://github.com/NVIDIA/Megatron-LM/blob/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2/megatron/core/transformer/mlp.py#L174." 
) + model_cfg.apply_rope_fusion = self.cfg["megatron_cfg"]["apply_rope_fusion"] checkpoint_config = CheckpointConfig( save_interval=100, diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 8371fababb..612630ded0 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -163,6 +163,7 @@ def get_basic_megatron_test_config( "context_parallel_size": 1, "pipeline_dtype": precision, "sequence_parallel": sequence_parallel, + "apply_rope_fusion": True, "optimizer": { "optimizer": "adam", "lr": 5.0e-6, diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 03f41a8b1f..7b56977258 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -79,6 +79,7 @@ def create_megatron_test_config( "context_parallel_size": 1, "pipeline_dtype": precision, "sequence_parallel": sequence_parallel, + "apply_rope_fusion": True, "optimizer": { "optimizer": "adam", "lr": 5.0e-6,