Skip to content

Commit ede15a7

Browse files
committed
Review updates.
1 parent a251fd4 commit ede15a7

File tree

11 files changed

+85
-85
lines changed

11 files changed

+85
-85
lines changed

docs/debugging.md

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -99,11 +99,11 @@ Use `--debug.deterministic_warn_only` to only warn about (not stop running) kern
9999

100100
The following debug configs are available for AC.
101101

102-
`ac_preserve_rng_state` - if deterministic output compared to non-checkpointed passes is required, set to true. Results in stashing and restoring the RNG state during each checkpoint, may be slower.
102+
`preserve_rng_state` - if deterministic output compared to non-checkpointed passes is required, set to true. Results in stashing and restoring the RNG state during each checkpoint, may be slower.
103103

104-
`ac_determinism_check` - A string specifying the determinism function
104+
`determinism_check` - A string specifying the determinism function
105105

106-
`ac_debug` - capture ac debug information. Will be slower.
106+
`debug` - capture ac debug information. Will be slower.
107107

108108
See https://docs.pytorch.org/docs/stable/checkpoint.html for details.
109109

tests/unit_tests/test_activation_checkpoint.py

Lines changed: 28 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@
1111
from torch.utils.flop_counter import FlopCounterMode
1212

1313
from torchtitan.config.job_config import ActivationCheckpoint as ACConfig
14-
from torchtitan.config.job_config import JobConfig
1514
from torchtitan.distributed.activation_checkpoint import apply_ac
1615

1716

@@ -75,16 +74,15 @@ def get_bw_flops(model_fn):
7574
# 2. SAC
7675
# Per-op SAC's policy is to save every other mm
7776
model_selective_ac = ToyModule()
78-
job_config = JobConfig()
79-
job_config.activation_checkpoint = ACConfig(
77+
ac_config_no_force = ACConfig(
8078
mode="selective",
8179
selective_ac_option="op",
8280
per_op_sac_force_recompute_mm_shapes_by_fqns=[], # Empty list
8381
early_stop=False,
8482
)
8583
apply_ac(
8684
model_selective_ac,
87-
job_config,
85+
ac_config_no_force,
8886
model_compile_enabled=False,
8987
use_flex_attn=False,
9088
op_sac_save_list=_op_sac_save_list,
@@ -94,15 +92,15 @@ def get_bw_flops(model_fn):
9492
# 3. Per-op SAC with force recompute "moe.router.gate"
9593
# This leads to two mms being recomputed since they share the same shape!
9694
model_with_force_first = ToyModule()
97-
job_config.activation_checkpoint = ACConfig(
95+
ac_config_with_force_first = ACConfig(
9896
mode="selective",
9997
selective_ac_option="op",
10098
per_op_sac_force_recompute_mm_shapes_by_fqns=["moe.router.gate"],
10199
early_stop=False,
102100
)
103101
apply_ac(
104102
model_with_force_first,
105-
job_config,
103+
ac_config_with_force_first,
106104
model_compile_enabled=False,
107105
use_flex_attn=False,
108106
op_sac_save_list=_op_sac_save_list,
@@ -111,15 +109,15 @@ def get_bw_flops(model_fn):
111109

112110
# 4. Per-op SAC with force recompute "output"
113111
model_with_force_last = ToyModule()
114-
job_config.activation_checkpoint = ACConfig(
112+
ac_config_with_force_last = ACConfig(
115113
mode="selective",
116114
selective_ac_option="op",
117115
per_op_sac_force_recompute_mm_shapes_by_fqns=["output"],
118116
early_stop=False,
119117
)
120118
apply_ac(
121119
model_with_force_last,
122-
job_config,
120+
ac_config_with_force_last,
123121
model_compile_enabled=False,
124122
use_flex_attn=False,
125123
op_sac_save_list=_op_sac_save_list,
@@ -128,13 +126,13 @@ def get_bw_flops(model_fn):
128126

129127
# 5. Full AC
130128
model_with_full_ac = ToyModule()
131-
job_config.activation_checkpoint = ACConfig(
129+
ac_config_full_ac = ACConfig(
132130
mode="full",
133131
early_stop=False,
134132
)
135133
apply_ac(
136134
model_with_full_ac,
137-
job_config,
135+
ac_config_full_ac,
138136
model_compile_enabled=False,
139137
use_flex_attn=False,
140138
op_sac_save_list=_op_sac_save_list,
@@ -170,14 +168,14 @@ def get_act_mem(model_fn):
170168
# 2. SAC
171169
# Per-op SAC's policy is to save every other mm
172170
model_selective_ac = ToyModule().cuda()
173-
job_config.activation_checkpoint = ACConfig(
171+
ac_config_no_force = ACConfig(
174172
mode="selective",
175173
selective_ac_option="op",
176174
per_op_sac_force_recompute_mm_shapes_by_fqns=[], # Empty list
177175
)
178176
apply_ac(
179177
model_selective_ac,
180-
job_config,
178+
ac_config_no_force,
181179
model_compile_enabled=False,
182180
use_flex_attn=False,
183181
op_sac_save_list=_op_sac_save_list,
@@ -187,14 +185,14 @@ def get_act_mem(model_fn):
187185
# 3. Per-op SAC with force recompute "moe.router.gate"
188186
# This leads to two mms being recomputed since they share the same shape!
189187
model_with_force_first = ToyModule().cuda()
190-
job_config.activation_checkpoint = ACConfig(
188+
ac_config_with_force_first = ACConfig(
191189
mode="selective",
192190
selective_ac_option="op",
193191
per_op_sac_force_recompute_mm_shapes_by_fqns=["moe.router.gate"],
194192
)
195193
apply_ac(
196194
model_with_force_first,
197-
job_config,
195+
ac_config_with_force_first,
198196
model_compile_enabled=False,
199197
use_flex_attn=False,
200198
op_sac_save_list=_op_sac_save_list,
@@ -203,14 +201,14 @@ def get_act_mem(model_fn):
203201

204202
# 4. Per-op SAC with force recompute "output"
205203
model_with_force_last = ToyModule().cuda()
206-
job_config.activation_checkpoint = ACConfig(
204+
ac_config_with_force_last = ACConfig(
207205
mode="selective",
208206
selective_ac_option="op",
209207
per_op_sac_force_recompute_mm_shapes_by_fqns=["output"],
210208
)
211209
apply_ac(
212210
model_with_force_last,
213-
job_config,
211+
ac_config_with_force_last,
214212
model_compile_enabled=False,
215213
use_flex_attn=False,
216214
op_sac_save_list=_op_sac_save_list,
@@ -219,12 +217,12 @@ def get_act_mem(model_fn):
219217

220218
# 5. Full AC
221219
model_with_full_ac = ToyModule().cuda()
222-
job_config.activation_checkpoint = ACConfig(
220+
ac_config_full_ac = ACConfig(
223221
mode="full",
224222
)
225223
apply_ac(
226224
model_with_full_ac,
227-
job_config,
225+
ac_config_full_ac,
228226
model_compile_enabled=False,
229227
use_flex_attn=False,
230228
op_sac_save_list=_op_sac_save_list,
@@ -245,44 +243,40 @@ def test_correctness(self):
245243

246244
model_selective_ac = ToyModule()
247245
model_selective_ac.load_state_dict(model_no_ac.state_dict())
248-
job_config = JobConfig()
249-
job_config.activation_checkpoint = ACConfig(
246+
apply_ac(
247+
model_selective_ac,
248+
ACConfig(
250249
mode="selective",
251250
selective_ac_option="op",
252251
per_op_sac_force_recompute_mm_shapes_by_fqns=[],
253-
)
254-
apply_ac(
255-
model_selective_ac,
256-
job_config,
252+
),
257253
model_compile_enabled=False,
258254
use_flex_attn=False,
259255
op_sac_save_list=_op_sac_save_list,
260256
)
261257
model_force_first = ToyModule()
262258
model_force_first.load_state_dict(model_no_ac.state_dict())
263-
job_config.activation_checkpoint = ACConfig(
259+
apply_ac(
260+
model_force_first,
261+
ACConfig(
264262
mode="selective",
265263
selective_ac_option="op",
266264
per_op_sac_force_recompute_mm_shapes_by_fqns=["moe.router.gate"],
267-
)
268-
apply_ac(
269-
model_force_first,
270-
job_config,
265+
),
271266
model_compile_enabled=False,
272267
use_flex_attn=False,
273268
op_sac_save_list=_op_sac_save_list,
274269
)
275270

276271
model_force_last = ToyModule()
277272
model_force_last.load_state_dict(model_no_ac.state_dict())
278-
job_config.activation_checkpoint = ACConfig(
273+
apply_ac(
274+
model_force_last,
275+
ACConfig(
279276
mode="selective",
280277
selective_ac_option="op",
281278
per_op_sac_force_recompute_mm_shapes_by_fqns=["output"],
282-
)
283-
apply_ac(
284-
model_force_last,
285-
job_config,
279+
),
286280
model_compile_enabled=False,
287281
use_flex_attn=False,
288282
op_sac_save_list=_op_sac_save_list,

torchtitan/config/job_config.py

Lines changed: 20 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -623,6 +623,26 @@ class ActivationCheckpoint:
623623
https://github.com/pytorch/pytorch/pull/126320#discussion_r1625104015
624624
"""
625625

626+
preserve_rng_state: bool = False
627+
"""
628+
If deterministic output compared to non-checkpointed passes is required, set
629+
to true. Results in stashing and restoring the RNG state during each checkpoint,
630+
may be slower. See https://docs.pytorch.org/docs/stable/checkpoint.html
631+
for details.
632+
"""
633+
634+
determinism_check: str = "default"
635+
"""
636+
A string specifying the determinism function. See
637+
https://docs.pytorch.org/docs/stable/checkpoint.html for details.
638+
"""
639+
640+
debug: bool = False
641+
"""
642+
Capture ac debug information. Will be slower. See
643+
https://docs.pytorch.org/docs/stable/checkpoint.html for details.
644+
"""
645+
626646

627647
@dataclass
628648
class Compile:
@@ -882,15 +902,6 @@ class Debug:
882902
deterministic_warn_only: bool = False
883903
"""Only warns about ops without deterministic implementations rather than erroring out """
884904

885-
ac_preserve_rng_state: bool = False
886-
"""If deterministic output compared to non-checkpointed passes is required, set to true. Results in stashing and restoring the RNG state during each checkpoint, may be slower. See https://docs.pytorch.org/docs/stable/checkpoint.html for details."""
887-
888-
ac_determinism_check: str = "default"
889-
"""A string specifying the determinism function. See https://docs.pytorch.org/docs/stable/checkpoint.html for details."""
890-
891-
ac_debug: bool = False
892-
""" Capture ac debug information. Will be slower. See https://docs.pytorch.org/docs/stable/checkpoint.html for details."""
893-
894905
moe_force_load_balance: bool = False
895906
"""If True, we force each experts to get the same amount of tokens via round-robin. This option is for debugging usage only."""
896907

0 commit comments

Comments (0)