From d1a83a1fe868176288c922673608f1bfa3a0ba83 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:24:39 -0700
Subject: [PATCH 1/4] Add config option for trying to load latest saved
 checkpoint

---
 olmo/config.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/olmo/config.py b/olmo/config.py
index ae454bb19..885490ce5 100644
--- a/olmo/config.py
+++ b/olmo/config.py
@@ -1027,6 +1027,14 @@ class TrainConfig(BaseConfig):
     The sharded checkpointer type to use to load the initial checkpoint from ``load_path``.
     """
 
+    try_load_latest_save: bool = False
+    """
+    If set and `load_path` is not set, then training will be resumed from the latest checkpoint
+    in the local save folder, falling back to the latest checkpoint in the remote save folder if none
+    exists. If there are no checkpoints in the local and remote save folders, then the model will be
+    initialized from scratch.
+    """
+
     reset_optimizer_state: bool = False
     """
     When this is set, we restore the model from a checkpoint (if given), but we leave the optimizer uninitialized.

From b39cc7b9527b6ea950dcb89713102954b844f108 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:25:03 -0700
Subject: [PATCH 2/4] Implement logic for trying to load latest saved
 checkpoint

---
 scripts/train.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/scripts/train.py b/scripts/train.py
index 1f7353095..54f6dcad5 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -41,6 +41,7 @@ from olmo.util import (
     add_cached_path_clients,
     clean_opt,
+    find_latest_checkpoint,
     log_extra_field,
     prepare_cli_environment,
 )
 
@@ -239,6 +240,20 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
         evaluators=evaluators,
         indices_file=indices_file,
     ) as trainer:
+        if cfg.try_load_latest_save and cfg.load_path is None:
+            if (
+                cfg.save_folder is not None
+                and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None
+            ):
+                log.info("Setting load path to local checkpoint %s", checkpoint_dir)
+                cfg.load_path = str(checkpoint_dir)
+            elif (
+                cfg.remote_save_folder is not None
+                and (checkpoint_dir := find_latest_checkpoint(cfg.remote_save_folder)) is not None
+            ):
+                log.info("Setting load path to remote checkpoint %s", checkpoint_dir)
+                cfg.load_path = str(checkpoint_dir)
+
         if not cfg.dry_run and not cfg.no_pre_train_checkpoint and cfg.load_path is None:
             if cfg.distributed_strategy == DistributedStrategy.ddp:
                 checkpoint_type = CheckpointType.unsharded

From 212cf47a2c602cb3fa28e643284f2eba3147e675 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:27:15 -0700
Subject: [PATCH 3/4] Update CHANGELOG

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61f32b16a..d680618e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Added
+
+- Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`.
+
 ## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26
 
 - Fixed conversion to HuggingFace model for DDP-trained models.
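Note on the helper used above: patch 2 imports `find_latest_checkpoint` from `olmo.util`, but its implementation is not part of this series. As a rough illustration only, here is a minimal sketch of what such a helper could look like for a local save folder; the assumed `stepNNNN`/`stepNNNN-unsharded` directory layout is an assumption, and the real helper must also handle remote folders, which this sketch does not:

```python
import re
from pathlib import Path
from typing import Optional


def find_latest_checkpoint(save_folder: str) -> Optional[Path]:
    """Return the checkpoint directory with the highest step number, or None.

    Hypothetical sketch: assumes checkpoints are saved as subdirectories
    named like "step1000" or "step1000-unsharded" under save_folder.
    """
    base = Path(save_folder)
    if not base.is_dir():
        return None
    latest: Optional[Path] = None
    latest_step = -1
    for child in base.iterdir():
        match = re.match(r"step(\d+)", child.name)
        if child.is_dir() and match and int(match.group(1)) > latest_step:
            latest_step = int(match.group(1))
            latest = child
    return latest
```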
From e18ed7f54d8fcc9f83727f271475cc95b2019ab1 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:42:46 -0700
Subject: [PATCH 4/4] Make try_load_latest_save override load_path

---
 olmo/config.py   | 12 +++++++-----
 scripts/train.py |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/olmo/config.py b/olmo/config.py
index 885490ce5..94e5103d2 100644
--- a/olmo/config.py
+++ b/olmo/config.py
@@ -1011,7 +1011,7 @@ class TrainConfig(BaseConfig):
     load_path: Optional[str] = None
     """
-    The path to a training checkpoint to restore/resume from.
+    The path to a training checkpoint to restore/resume from. If not set, then training begins from scratch.
 
     Note that you can make use of the "path.last_checkpoint" Omegaconfig YAML resolver here, which takes
     a local or remote directory and resolves to the latest checkpoint (sharded or unsharded) in that directory.
@@ -1018,8 +1018,11 @@ class TrainConfig(BaseConfig):
     For example,
 
     ```bash
     --load_path='${path.last_checkpoint:s3://ai2-llm/checkpoints/7b/v1_5-mix-run-001}'
     ```
+
+    If `try_load_latest_save` is set and saved checkpoints exist, then `load_path` will be overridden
+    by the latest saved checkpoint.
     """
 
     load_path_sharded_checkpointer: Optional[ShardedCheckpointerType] = None
@@ -1029,10 +1032,9 @@ class TrainConfig(BaseConfig):
 
     try_load_latest_save: bool = False
     """
-    If set and `load_path` is not set, then training will be resumed from the latest checkpoint
-    in the local save folder, falling back to the latest checkpoint in the remote save folder if none
-    exists. If there are no checkpoints in the local and remote save folders, then the model will be
-    initialized from scratch.
+    If set, then training will be resumed from the latest checkpoint in the local save folder, falling
+    back to the latest checkpoint in the remote save folder if none exists. If there are no checkpoints
+    in the local and remote save folders, then checkpoint loading will fall back to `load_path`.
     """
 
     reset_optimizer_state: bool = False

diff --git a/scripts/train.py b/scripts/train.py
index 54f6dcad5..4ccb41607 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -240,7 +240,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
         evaluators=evaluators,
         indices_file=indices_file,
     ) as trainer:
-        if cfg.try_load_latest_save and cfg.load_path is None:
+        if cfg.try_load_latest_save:
             if (
                 cfg.save_folder is not None
                 and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None
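To summarize the behavior after all four patches: `try_load_latest_save` now takes precedence over an explicit `load_path`, and the local save folder is checked before the remote one. The sketch below paraphrases the final resolution order for illustration; `resolve_load_path` is a hypothetical name, not a function in the repo, and `find_latest_checkpoint` is the helper imported in patch 2:

```python
from typing import Optional


def resolve_load_path(cfg) -> Optional[str]:
    """Illustrative paraphrase of the checkpoint-resolution order in scripts/train.py."""
    if cfg.try_load_latest_save:
        # The latest saved checkpoint wins, even over an explicit --load_path;
        # the local save folder is checked before the remote one.
        for folder in (cfg.save_folder, cfg.remote_save_folder):
            if folder is not None and (ckpt := find_latest_checkpoint(folder)) is not None:
                return str(ckpt)
    # Fall back to load_path; if that is also None, training starts from scratch.
    return cfg.load_path
```

One plausible motivation for letting the latest save win: a preemptible job can keep `load_path` pointed at a base checkpoint for its first launch, yet still resume from its own most recent save after every restart.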