From d1a83a1fe868176288c922673608f1bfa3a0ba83 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:24:39 -0700
Subject: [PATCH 1/4] Add config option for trying to load latest saved
 checkpoint

---
 olmo/config.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/olmo/config.py b/olmo/config.py
index ae454bb19..885490ce5 100644
--- a/olmo/config.py
+++ b/olmo/config.py
@@ -1027,6 +1027,14 @@ class TrainConfig(BaseConfig):
     The sharded checkpointer type to use to load the initial checkpoint from ``load_path``.
     """
 
+    try_load_latest_save: bool = False
+    """
+    If set and `load_path` is not set, then training will be resumed from the latest checkpoint
+    in the local save folder, falling back to the latest checkpoint in the remote save folder if none
+    exists. If there are no checkpoints in the local and remote save folders, then the model will be
+    initialized from scratch.
+    """
+
     reset_optimizer_state: bool = False
     """
     When this is set, we restore the model from a checkpoint (if given), but we leave the optimizer uninitialized.

From b39cc7b9527b6ea950dcb89713102954b844f108 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:25:03 -0700
Subject: [PATCH 2/4] Implement logic for trying to load latest saved
 checkpoint

---
 scripts/train.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/scripts/train.py b/scripts/train.py
index 1f7353095..54f6dcad5 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -41,6 +41,7 @@ from olmo.util import (
     add_cached_path_clients,
     clean_opt,
+    find_latest_checkpoint,
     log_extra_field,
     prepare_cli_environment,
 )
 
@@ -239,6 +240,20 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
         evaluators=evaluators,
         indices_file=indices_file,
     ) as trainer:
+        if cfg.try_load_latest_save and cfg.load_path is None:
+            if (
+                cfg.save_folder is not None
+                and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None
+            ):
+                log.info("Setting load path to local checkpoint %s", checkpoint_dir)
+                cfg.load_path = str(checkpoint_dir)
+            elif (
+                cfg.remote_save_folder is not None
+                and (checkpoint_dir := find_latest_checkpoint(cfg.remote_save_folder)) is not None
+            ):
+                log.info("Setting load path to remote checkpoint %s", checkpoint_dir)
+                cfg.load_path = str(checkpoint_dir)
+
         if not cfg.dry_run and not cfg.no_pre_train_checkpoint and cfg.load_path is None:
             if cfg.distributed_strategy == DistributedStrategy.ddp:
                 checkpoint_type = CheckpointType.unsharded

From 212cf47a2c602cb3fa28e643284f2eba3147e675 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:27:15 -0700
Subject: [PATCH 3/4] Update CHANGELOG

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61f32b16a..d680618e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Added
+
+- Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`.
+
 ## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26
 
 - Fixed conversion to HuggingFace model for DDP-trained models.
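Note on the helper used above: patch 2 imports `find_latest_checkpoint` from `olmo.util`, but its implementation is not part of this series. As a rough illustration only, here is a minimal sketch of what such a helper could look like for a local save folder; the assumed `stepNNNN`/`stepNNNN-unsharded` directory layout is an assumption, and the real helper must also handle remote folders, which this sketch does not:

```python
import re
from pathlib import Path
from typing import Optional


def find_latest_checkpoint(save_folder: str) -> Optional[Path]:
    """Return the checkpoint directory with the highest step number, or None.

    Hypothetical sketch: assumes checkpoints are saved as subdirectories
    named like "step1000" or "step1000-unsharded" under save_folder.
    """
    base = Path(save_folder)
    if not base.is_dir():
        return None
    latest: Optional[Path] = None
    latest_step = -1
    for child in base.iterdir():
        match = re.match(r"step(\d+)", child.name)
        if child.is_dir() and match and int(match.group(1)) > latest_step:
            latest_step = int(match.group(1))
            latest = child
    return latest
```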
From e18ed7f54d8fcc9f83727f271475cc95b2019ab1 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Thu, 29 Aug 2024 17:42:46 -0700
Subject: [PATCH 4/4] Make try_load_latest_save override load_path

---
 olmo/config.py   | 12 +++++++-----
 scripts/train.py |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/olmo/config.py b/olmo/config.py
index 885490ce5..94e5103d2 100644
--- a/olmo/config.py
+++ b/olmo/config.py
@@ -1011,7 +1011,7 @@ class TrainConfig(BaseConfig):
     load_path: Optional[str] = None
     """
-    The path to a training checkpoint to restore/resume from.
+    The path to a training checkpoint to restore/resume from. If not set, then training begins from scratch.
 
     Note that you can make use of the "path.last_checkpoint" Omegaconfig YAML resolver here, which takes
     a local or remote directory and resolves to the latest checkpoint (sharded or unsharded) in that directory.
@@ -1018,8 +1018,11 @@ class TrainConfig(BaseConfig):
     For example,
 
     ```bash
     --load_path='${path.last_checkpoint:s3://ai2-llm/checkpoints/7b/v1_5-mix-run-001}'
     ```
+
+    If `try_load_latest_save` is set and saved checkpoints exist, then `load_path` will be overridden
+    by the latest saved checkpoint.
     """
 
     load_path_sharded_checkpointer: Optional[ShardedCheckpointerType] = None
@@ -1029,10 +1032,9 @@ class TrainConfig(BaseConfig):
 
     try_load_latest_save: bool = False
     """
-    If set and `load_path` is not set, then training will be resumed from the latest checkpoint
-    in the local save folder, falling back to the latest checkpoint in the remote save folder if none
-    exists. If there are no checkpoints in the local and remote save folders, then the model will be
-    initialized from scratch.
+    If set, then training will be resumed from the latest checkpoint in the local save folder, falling
+    back to the latest checkpoint in the remote save folder if none exists. If there are no checkpoints
+    in the local and remote save folders, then checkpoint loading will fall back to `load_path`.
     """
 
     reset_optimizer_state: bool = False

diff --git a/scripts/train.py b/scripts/train.py
index 54f6dcad5..4ccb41607 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -240,7 +240,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
         evaluators=evaluators,
         indices_file=indices_file,
     ) as trainer:
-        if cfg.try_load_latest_save and cfg.load_path is None:
+        if cfg.try_load_latest_save:
             if (
                 cfg.save_folder is not None
                 and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None
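To summarize the behavior after all four patches: `try_load_latest_save` now takes precedence over an explicit `load_path`, and the local save folder is checked before the remote one. The sketch below paraphrases the final resolution order for illustration; `resolve_load_path` is a hypothetical name, not a function in the repo, and `find_latest_checkpoint` is the helper imported in patch 2:

```python
from typing import Optional


def resolve_load_path(cfg) -> Optional[str]:
    """Illustrative paraphrase of the checkpoint-resolution order in scripts/train.py."""
    if cfg.try_load_latest_save:
        # The latest saved checkpoint wins, even over an explicit --load_path;
        # the local save folder is checked before the remote one.
        for folder in (cfg.save_folder, cfg.remote_save_folder):
            if folder is not None and (ckpt := find_latest_checkpoint(folder)) is not None:
                return str(ckpt)
    # Fall back to load_path; if that is also None, training starts from scratch.
    return cfg.load_path
```

One plausible motivation for letting the latest save win: a preemptible job can keep `load_path` pointed at a base checkpoint for its first launch, yet still resume from its own most recent save after every restart.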