
Commit

Merge pull request #717 from allenai/shanea/try-load-latest-save-2
Added ability to try loading the latest checkpoint from save folders
2015aroras authored Sep 3, 2024
2 parents 46f06cb + e18ed7f commit ca81901
Showing 3 changed files with 30 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

+### Added
+
+- Added ability to try loading the latest checkpoint from the save folder using `--try_load_latest_save`.
+
## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26

- Fixed conversion to HuggingFace model for DDP-trained models.
12 changes: 11 additions & 1 deletion olmo/config.py
@@ -1011,7 +1011,7 @@ class TrainConfig(BaseConfig):

load_path: Optional[str] = None
"""
-The path to a training checkpoint to restore/resume from.
+The path to a training checkpoint to restore/resume from. If not set, then training begins from scratch.
Note that you can make use of the "path.last_checkpoint" Omegaconfig YAML resolver here, which takes
a local or remote directory and resolves to the latest checkpoint (sharded or unsharded) in that directory.
@@ -1020,13 +1020,23 @@
```bash
--load_path='${path.last_checkpoint:s3://ai2-llm/checkpoints/7b/v1_5-mix-run-001}'
```
+If `try_load_latest_save` is set and saved checkpoints exist, then `load_path` will be overridden
+by the latest saved checkpoint.
"""

load_path_sharded_checkpointer: Optional[ShardedCheckpointerType] = None
"""
The sharded checkpointer type to use to load the initial checkpoint from ``load_path``.
"""

+try_load_latest_save: bool = False
+"""
+If set, training resumes from the latest checkpoint in the local save folder, falling back to the
+latest checkpoint in the remote save folder if no local checkpoint exists. If neither folder
+contains a checkpoint, loading falls back to `load_path`.
+"""
+
reset_optimizer_state: bool = False
"""
When this is set, we restore the model from a checkpoint (if given), but we leave the optimizer uninitialized.
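Taken together, `load_path` and `try_load_latest_save` enable a resume-by-default launch pattern. A minimal invocation sketch, assuming a hypothetical config file and save folder (both paths are illustrative, not from this commit):

```bash
# Resume from the newest checkpoint in --save_folder if one exists;
# otherwise fall back to load_path (unset here, so training starts from scratch).
python scripts/train.py configs/my-run.yaml \
  --save_folder=/data/checkpoints/my-run \
  --try_load_latest_save=true
```

With this setup, re-running the same command after an interruption picks up from the most recent checkpoint automatically.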
15 changes: 15 additions & 0 deletions scripts/train.py
@@ -41,6 +41,7 @@
from olmo.util import (
add_cached_path_clients,
clean_opt,
+find_latest_checkpoint,
log_extra_field,
prepare_cli_environment,
)
@@ -239,6 +240,20 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
evaluators=evaluators,
indices_file=indices_file,
) as trainer:
+if cfg.try_load_latest_save:
+    if (
+        cfg.save_folder is not None
+        and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None
+    ):
+        log.info("Setting load path to local checkpoint %s", checkpoint_dir)
+        cfg.load_path = str(checkpoint_dir)
+    elif (
+        cfg.remote_save_folder is not None
+        and (checkpoint_dir := find_latest_checkpoint(cfg.remote_save_folder)) is not None
+    ):
+        log.info("Setting load path to remote checkpoint %s", checkpoint_dir)
+        cfg.load_path = str(checkpoint_dir)
+
if not cfg.dry_run and not cfg.no_pre_train_checkpoint and cfg.load_path is None:
if cfg.distributed_strategy == DistributedStrategy.ddp:
checkpoint_type = CheckpointType.unsharded
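The resolution order implemented above is: latest checkpoint in the local save folder, then latest checkpoint in the remote save folder, then whatever `load_path` already holds. The `find_latest_checkpoint` helper itself is not part of this diff; the sketch below is a minimal, local-filesystem-only approximation that assumes checkpoints are saved as `step<N>` (optionally `step<N>-unsharded`) directories — the naming convention and regex are assumptions for illustration, not the library's actual implementation.

```python
# Minimal local-only sketch of a find_latest_checkpoint helper. Assumed
# behavior: the real olmo.util.find_latest_checkpoint is not shown in this
# diff and also handles remote save folders.
import re
from pathlib import Path
from typing import Optional, Tuple

# Assumed checkpoint directory naming: step<N> or step<N>-unsharded.
_STEP_RE = re.compile(r"step(\d+)(-unsharded)?$")


def find_latest_checkpoint(save_folder: str) -> Optional[Path]:
    """Return the checkpoint directory with the highest step number, or None."""
    folder = Path(save_folder)
    if not folder.is_dir():
        return None
    best: Optional[Tuple[int, Path]] = None
    for path in folder.iterdir():
        match = _STEP_RE.match(path.name)
        if path.is_dir() and match is not None:
            step = int(match.group(1))
            if best is None or step > best[0]:
                best = (step, path)
    return None if best is None else best[1]
```

Checking the local folder before the remote one means a run restarted on the same machine can resume without re-downloading a checkpoint it already wrote locally.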
