From 638c27c04800afd91dff1f3048a3003880c9cb21 Mon Sep 17 00:00:00 2001 From: Tushar Jain Date: Mon, 28 Jul 2025 11:34:30 -0700 Subject: [PATCH 1/2] remove dead code Summary: remove some stale code that determines parameters to pass to outer optimizer --- torchtitan/components/ft.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/torchtitan/components/ft.py b/torchtitan/components/ft.py index 70b814f3aa..76f2da3ae5 100644 --- a/torchtitan/components/ft.py +++ b/torchtitan/components/ft.py @@ -123,8 +123,6 @@ def maybe_semi_sync_training( ), "FTManager must be enabled to use semi-sync training." if semi_sync_method.lower() == "diloco": # Create the outer optimizer based on the inner optimizer parameters. - params = [group["params"] for group in optimizer.param_groups] - params = [param for sublist in params for param in sublist] outer_optimizers = [] for model in model_parts: params = [p for p in model.parameters() if p.requires_grad] From a0655348a607c72cf156e9f0dc15a6422a97472d Mon Sep 17 00:00:00 2001 From: Tushar Jain Date: Mon, 28 Jul 2025 20:44:03 -0700 Subject: [PATCH 2/2] fix creating leaf folder Summary: the leaf folder wasn't being created and no profiles were being written, so create it if it doesn't exist --- torchtitan/tools/profiling.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/torchtitan/tools/profiling.py b/torchtitan/tools/profiling.py index 843c13a746..0e851d335a 100644 --- a/torchtitan/tools/profiling.py +++ b/torchtitan/tools/profiling.py @@ -40,16 +40,14 @@ def maybe_enable_profiling( def trace_handler(prof): curr_trace_dir_name = "iteration_" + str(prof.step_num) - curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name) + curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name, leaf_folder) if not os.path.exists(curr_trace_dir): os.makedirs(curr_trace_dir, exist_ok=True) logger.info(f"Dumping profiler traces at step {prof.step_num}") begin = time.monotonic() - output_file = os.path.join( - 
curr_trace_dir, leaf_folder, f"rank{rank}_trace.json" - ) + output_file = os.path.join(curr_trace_dir, f"rank{rank}_trace.json") prof.export_chrome_trace(output_file) logger.info( f"Finished dumping profiler traces in {time.monotonic() - begin:.2f} seconds" @@ -123,13 +121,13 @@ def step(self, exit_ctx: bool = False): # dump as iteration_0_exit if OOM at iter 1 curr_step = self.step_num - 1 dir_name = f"iteration_{curr_step}_exit" - curr_snapshot_dir = os.path.join(snapshot_dir, dir_name) + curr_snapshot_dir = os.path.join(snapshot_dir, dir_name, leaf_folder) if not os.path.exists(curr_snapshot_dir): os.makedirs(curr_snapshot_dir, exist_ok=True) logger.info(f"Dumping memory snapshot at step {curr_step}") begin = time.monotonic() output_file = os.path.join( - curr_snapshot_dir, leaf_folder, f"rank{rank}_memory_snapshot.pickle" + curr_snapshot_dir, f"rank{rank}_memory_snapshot.pickle" ) with open(output_file, "wb") as output: pickle.dump(torch.cuda.memory._snapshot(), output)