
Commit

Merge pull request #717 from allenai/shanea/try-load-latest-save-2
Added ability to try loading the latest checkpoint from save folders
2015aroras authored Sep 3, 2024
2 parents 46f06cb + e18ed7f commit ca81901
Showing 3 changed files with 30 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

+### Added
+
+- Added ability to try loading the latest checkpoint from the save folder using `--try_load_latest_save`.
+
## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26

- Fixed conversion to HuggingFace model for DDP-trained models.
12 changes: 11 additions & 1 deletion olmo/config.py
@@ -1011,7 +1011,7 @@ class TrainConfig(BaseConfig):

load_path: Optional[str] = None
"""
-The path to a training checkpoint to restore/resume from.
+The path to a training checkpoint to restore/resume from. If not set, then training begins from scratch.
Note that you can make use of the "path.last_checkpoint" Omegaconfig YAML resolver here, which takes
a local or remote directory and resolves to the latest checkpoint (sharded or unsharded) in that directory.
@@ -1020,13 +1020,23 @@
```bash
--load_path='${path.last_checkpoint:s3://ai2-llm/checkpoints/7b/v1_5-mix-run-001}'
```
+If `try_load_latest_save` is set and saved checkpoints exist, then `load_path` will be overridden
+by the latest saved checkpoint.
"""

load_path_sharded_checkpointer: Optional[ShardedCheckpointerType] = None
"""
The sharded checkpointer type to use to load the initial checkpoint from ``load_path``.
"""

+try_load_latest_save: bool = False
+"""
+If set, training resumes from the latest checkpoint in the local save folder, falling back to the
+latest checkpoint in the remote save folder if no local checkpoint exists. If neither folder
+contains a checkpoint, loading falls back to `load_path`.
+"""
+
reset_optimizer_state: bool = False
"""
When this is set, we restore the model from a checkpoint (if given), but we leave the optimizer uninitialized.
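Taken together, `load_path` and `try_load_latest_save` enable a resume-by-default launch pattern. A minimal invocation sketch, assuming a hypothetical config file and save folder (both paths are illustrative, not from this commit):

```bash
# Resume from the newest checkpoint in --save_folder if one exists;
# otherwise fall back to load_path (unset here, so training starts from scratch).
python scripts/train.py configs/my-run.yaml \
  --save_folder=/data/checkpoints/my-run \
  --try_load_latest_save=true
```

With this setup, re-running the same command after an interruption picks up from the most recent checkpoint automatically.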
15 changes: 15 additions & 0 deletions scripts/train.py
@@ -41,6 +41,7 @@
from olmo.util import (
add_cached_path_clients,
clean_opt,
+find_latest_checkpoint,
log_extra_field,
prepare_cli_environment,
)
@@ -239,6 +240,20 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
evaluators=evaluators,
indices_file=indices_file,
) as trainer:
+if cfg.try_load_latest_save:
+    if (
+        cfg.save_folder is not None
+        and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None
+    ):
+        log.info("Setting load path to local checkpoint %s", checkpoint_dir)
+        cfg.load_path = str(checkpoint_dir)
+    elif (
+        cfg.remote_save_folder is not None
+        and (checkpoint_dir := find_latest_checkpoint(cfg.remote_save_folder)) is not None
+    ):
+        log.info("Setting load path to remote checkpoint %s", checkpoint_dir)
+        cfg.load_path = str(checkpoint_dir)
+
if not cfg.dry_run and not cfg.no_pre_train_checkpoint and cfg.load_path is None:
if cfg.distributed_strategy == DistributedStrategy.ddp:
checkpoint_type = CheckpointType.unsharded
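The resolution order implemented above is: latest checkpoint in the local save folder, then latest checkpoint in the remote save folder, then whatever `load_path` already holds. The `find_latest_checkpoint` helper itself is not part of this diff; the sketch below is a minimal, local-filesystem-only approximation that assumes checkpoints are saved as `step<N>` (optionally `step<N>-unsharded`) directories — the naming convention and regex are assumptions for illustration, not the library's actual implementation.

```python
# Minimal local-only sketch of a find_latest_checkpoint helper. Assumed
# behavior: the real olmo.util.find_latest_checkpoint is not shown in this
# diff and also handles remote save folders.
import re
from pathlib import Path
from typing import Optional, Tuple

# Assumed checkpoint directory naming: step<N> or step<N>-unsharded.
_STEP_RE = re.compile(r"step(\d+)(-unsharded)?$")


def find_latest_checkpoint(save_folder: str) -> Optional[Path]:
    """Return the checkpoint directory with the highest step number, or None."""
    folder = Path(save_folder)
    if not folder.is_dir():
        return None
    best: Optional[Tuple[int, Path]] = None
    for path in folder.iterdir():
        match = _STEP_RE.match(path.name)
        if path.is_dir() and match is not None:
            step = int(match.group(1))
            if best is None or step > best[0]:
                best = (step, path)
    return None if best is None else best[1]
```

Checking the local folder before the remote one means a run restarted on the same machine can resume without re-downloading a checkpoint it already wrote locally.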
