diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index db6fb7fa6d..25f5b59ef2 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -98,7 +98,9 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: true - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 #4.0e-5 diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 237fbb0df1..e9bcbf20b8 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -79,7 +79,9 @@ policy: context_parallel_size: 1 pipeline_dtype: ${policy.precision} sequence_parallel: false - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml index 1fd336d0b4..5b2b073691 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml @@ -55,7 +55,9 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: true - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 #4.0e-5 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 73008f3154..9388e8ed6f 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -55,7 +55,9 @@ policy: num_layers_in_first_pipeline_stage: 
null num_layers_in_last_pipeline_stage: null sequence_parallel: true - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 #4.0e-5 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml index ddd53920e6..68f0d177cd 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml @@ -43,7 +43,9 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: false - + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true + optimizer: optimizer: "adam" lr: 5.0e-6 diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index e3c614e2a7..4127411af6 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -73,6 +73,8 @@ policy: num_layers_in_first_pipeline_stage: null num_layers_in_last_pipeline_stage: null sequence_parallel: false + # gives ~20% training perf speedup with sequence packing + apply_rope_fusion: true optimizer: optimizer: "adam" diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 9113723af0..5cee6fcaec 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -466,6 +466,7 @@ def __init__( "a lambda and couldn't be serialized). This is based on this check " "https://github.com/NVIDIA/Megatron-LM/blob/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2/megatron/core/transformer/mlp.py#L174." 
) + model_cfg.apply_rope_fusion = self.cfg["megatron_cfg"]["apply_rope_fusion"] checkpoint_config = CheckpointConfig( save_interval=100, diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 8371fababb..612630ded0 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -163,6 +163,7 @@ def get_basic_megatron_test_config( "context_parallel_size": 1, "pipeline_dtype": precision, "sequence_parallel": sequence_parallel, + "apply_rope_fusion": True, "optimizer": { "optimizer": "adam", "lr": 5.0e-6, diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 03f41a8b1f..7b56977258 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -79,6 +79,7 @@ def create_megatron_test_config( "context_parallel_size": 1, "pipeline_dtype": precision, "sequence_parallel": sequence_parallel, + "apply_rope_fusion": True, "optimizer": { "optimizer": "adam", "lr": 5.0e-6,