From 16f66702bff18b26146ce9883caec3a4e49acad5 Mon Sep 17 00:00:00 2001
From: wesleytruong
Date: Wed, 6 Aug 2025 16:23:11 -0700
Subject: [PATCH] Reorder validate and checkpoint in train

If validation and checkpoint occur on the same training step, do
checkpointing first
---
 torchtitan/train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchtitan/train.py b/torchtitan/train.py
index 807dea8bc5..f55530a083 100644
--- a/torchtitan/train.py
+++ b/torchtitan/train.py
@@ -575,6 +575,10 @@ def train(self):
                     logger.warning("Ran out of data; last step was canceled.")
                     break
 
+                self.checkpointer.save(
+                    self.step, last_step=(self.step == job_config.training.steps)
+                )
+
                 # Run validation if validator is available
                 if (
                     self.job_config.validation.enabled
@@ -582,10 +586,6 @@ def train(self):
                 ):
                     self.validator.validate(self.model_parts, self.step)
 
-                self.checkpointer.save(
-                    self.step, last_step=(self.step == job_config.training.steps)
-                )
-
                 # signal the profiler that the next profiling step has started
                 if torch_profiler:
                     torch_profiler.step()
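
Note (commentary outside the patch): below is a minimal, runnable sketch of
the end-of-step ordering this patch produces (checkpoint first, then
validation). The StubCheckpointer/StubValidator classes and the
total_steps/validation_enabled values are hypothetical stand-ins invented for
illustration, not torchtitan APIs; only the save-then-validate ordering comes
from the diff above. One plausible reading of the reordering is that it makes
the step's checkpoint durable before validation runs, so a failure or
preemption during validation cannot lose it.

# Hedged sketch: stub classes are hypothetical stand-ins, not torchtitan code.
class StubCheckpointer:
    def save(self, step: int, last_step: bool = False) -> None:
        print(f"step {step}: checkpoint saved (last_step={last_step})")


class StubValidator:
    def should_validate(self, step: int) -> bool:
        return step % 2 == 0  # hypothetical validation cadence

    def validate(self, model_parts, step: int) -> None:
        print(f"step {step}: validation ran")


checkpointer = StubCheckpointer()
validator = StubValidator()
total_steps = 4            # hypothetical stand-in for job_config.training.steps
validation_enabled = True  # hypothetical stand-in for job_config.validation.enabled

for step in range(1, total_steps + 1):
    # ... forward/backward/optimizer work for the step would happen here ...

    # New ordering from the patch: persist the step's state first.
    checkpointer.save(step, last_step=(step == total_steps))

    # Then validate, matching the block moved below checkpointing in train.py.
    if validation_enabled and validator.should_validate(step):
        validator.validate(model_parts=None, step=step)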