@@ -42,6 +42,8 @@
     InitFnType,
     LayerNormType,
     ModelConfig,
+    ShardedCheckpointerType,
+    TrainConfig,
 )
 from .exceptions import OLMoConfigurationError
 from .initialization import init_normal
@@ -1740,15 +1742,26 @@ def from_checkpoint(
             model.load_state_dict(model._make_state_dict_compatible(state_dict)[0])
             model = model.to(torch.device(device))
         else:
-            from .checkpoint import load_model_state
+            train_config = TrainConfig.load(config_path)
+            if train_config.sharded_checkpointer == ShardedCheckpointerType.olmo_core:
+                from olmo_core.distributed.checkpoint import (  # type: ignore
+                    load_model_and_optim_state,
+                )

-            # Initialize model on target device. In this case the state dict is loaded in-place
-            # so it's not necessary to start on CPU if the target device is a GPU.
-            model_config.init_device = device
-            model = OLMo(model_config)
+                model_config.init_device = device
+                model = OLMo(model_config)
+                load_model_and_optim_state(checkpoint_dir, model)
+            else:
+                # train_config.sharded_checkpointer == ShardedCheckpointerType.torch_new
+                from .checkpoint import load_model_state
+
+                # Initialize model on target device. In this case the state dict is loaded in-place
+                # so it's not necessary to start on CPU if the target device is a GPU.
+                model_config.init_device = device
+                model = OLMo(model_config)

-            # Load state dict in place.
-            load_model_state(checkpoint_dir, model)
+                # Load state dict in place.
+                load_model_state(checkpoint_dir, model)

         return model.eval()

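For callers nothing changes at the call site: `from_checkpoint` now reads the checkpoint's own `TrainConfig` and picks the matching sharded loader itself. The sketch below is illustrative only; it assumes `from_checkpoint` is the classmethod on `OLMo` named in the hunk header, taking a checkpoint directory and a target device, and the import path and checkpoint path are placeholders.

```python
import torch

from olmo.model import OLMo  # import path assumed; OLMo is the class patched above

# Hypothetical sharded checkpoint directory. from_checkpoint loads the
# TrainConfig stored with the checkpoint and dispatches on
# train_config.sharded_checkpointer: ShardedCheckpointerType.olmo_core uses
# olmo_core.distributed.checkpoint.load_model_and_optim_state, otherwise
# (e.g. ShardedCheckpointerType.torch_new) it falls back to the repo's own
# load_model_state, exactly as in the hunk above.
model = OLMo.from_checkpoint(
    "/path/to/sharded/checkpoint",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
```

The returned model is already in eval mode (`return model.eval()` above), so no extra call is needed before running inference.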