Enable checkpointing with DCP #26
@@ -0,0 +1,146 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import enum
import os
import re
import time
from typing import Any, Dict

import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp
import torch.nn as nn
from torch.distributed.checkpoint.state_dict import (
    get_model_state_dict,
    get_optimizer_state_dict,
    set_model_state_dict,
    set_optimizer_state_dict,
)
from torchtrain.logging_utils import rank0_log


class IntervalType(enum.Enum):
    SECONDS = enum.auto()
    STEPS = enum.auto()


class ModelWrapper:
    def __init__(self, model: nn.Module) -> None:
        self.model = model

    def state_dict(self) -> Dict[str, Any]:
        return get_model_state_dict(self.model)

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        set_model_state_dict(self.model, state_dict)


class OptimizerWrapper:
    def __init__(self, model: nn.Module, optim: torch.optim.Optimizer) -> None:
        self.model = model
        self.optim = optim

    def state_dict(self) -> Dict[str, Any]:
        return get_optimizer_state_dict(self.model, self.optim)

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        set_optimizer_state_dict(self.model, self.optim, optim_state_dict=state_dict)


class CheckpointManager:
    def __init__(
        self,
        model: nn.Module,
        optimizer: torch.optim.Optimizer,
        states: Dict[str, Any],
        folder: str,
        interval_type: IntervalType,
        interval: int,
    ) -> None:
        self.folder = folder
        self.states = states
        self.states.update(
            {
                "model": ModelWrapper(model),
                "optimizer": OptimizerWrapper(model, optimizer),
            }
        )
        self.interval_type = interval_type
        self.interval = interval
        self.begin = 0
        self.work = None
        self.pg = dist.new_group(backend="gloo")
        self.doit = None

    def reset(self) -> None:
        self.begin = time.monotonic()

    def create_checkpoint_id(self, step: int) -> str:
        return os.path.join(self.folder, f"step-{step}")

    def save(self, curr_step: int, force: bool = False) -> None:
        if not self.folder:
            return

        if not force:
            if self.interval_type == IntervalType.STEPS and not (
                curr_step % self.interval == 0
            ):
                return
            if self.interval_type == IntervalType.SECONDS:

[Review comment] I would prefer we get rid of the seconds handling, as mentioned in another comment, to keep our stack simple enough. We can add it back once we feel this mode is needed for actual training.
[Reply] It is sometimes better to use time, because a lot of features can change the per-iteration time, such as model type, batch size, and other settings. Using steps may require some tuning to avoid affecting overall performance. We can change the default to steps so that users don't need to worry about it now.
[Reply] Sure, sounds good! My main motivation is for our library to be as simple as possible; we can evaluate whether the time interval type is useful once we start real trainings, and decide later whether to keep it.

                doit = (time.monotonic() - self.begin) >= self.interval
                self.doit = torch.tensor(int(doit))
                if self.work is None:
                    self.work = dist.all_reduce(self.doit, group=self.pg, async_op=True)
                    return
                elif curr_step % 5 == 4:
                    self.work.wait()
                    self.work = None
                    doit = self.doit.item()
                    self.doit = None
                    if doit == 0:
                        return
                else:
                    return

        if self.work:
            self.work.wait()
            self.work = None
            self.doit = None

        rank0_log(f"Saving a checkpoint at step {curr_step}.")
        begin = time.monotonic()
        dcp.save(self.states, checkpoint_id=self.create_checkpoint_id(curr_step))
        self.reset()
        rank0_log(
            f"Finished saving the checkpoint at step {curr_step} "
            f"in {time.monotonic() - begin} seconds."
        )

    def load(self, step: int = -1) -> bool:

[Review comment] why we have a …
[Reply] We do use the …

        if not self.folder:
            return False

[Review comment] nit: we should check either in …

        if not os.path.isdir(self.folder):
            return False
        if step != -1 and not os.path.isdir(self.create_checkpoint_id(step)):
            return False

        if step == -1:
            step_counts = []
            for filename in os.listdir(self.folder):
                match = re.search(r"step-(\d+)", filename)
                if match:
                    step_counts.append(int(match.group(1)))
            if not step_counts:
                return False
            step = max(step_counts)

        rank0_log("Loading a checkpoint.")
        begin = time.monotonic()
        dcp.load(
            self.states,
            checkpoint_id=self.create_checkpoint_id(step),
        )
        rank0_log(f"Finished loading the checkpoint in {time.monotonic() - begin} seconds.")
        return True
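
The SECONDS branch above has to reach the same save/skip decision on every rank, because dcp.save is a collective operation. As the review thread on that branch discusses, each rank proposes a 0/1 "interval elapsed" flag, the flags are summed with an asynchronous all-reduce on a dedicated gloo (CPU) group, and the PR hides the reduction latency by only waiting for it a few steps later (the curr_step % 5 == 4 check). Below is a minimal standalone sketch of that rank-agreement pattern, not the PR's code: the function name should_save and the blocking wait() are illustrative simplifications, and the script assumes it is launched with torchrun so init_process_group can read its rendezvous settings from the environment.

import time

import torch
import torch.distributed as dist


def should_save(begin: float, interval_s: float, pg: dist.ProcessGroup) -> bool:
    # Each rank proposes a 0/1 flag based on its own clock.
    elapsed = (time.monotonic() - begin) >= interval_s
    flag = torch.tensor(int(elapsed))
    # Sum the flags across ranks; every rank sees the same total, so every
    # rank reaches the same save/skip decision even if their clocks drift.
    work = dist.all_reduce(flag, group=pg, async_op=True)
    # The PR overlaps this wait with a few training steps; here we just block.
    work.wait()
    return flag.item() > 0


if __name__ == "__main__":
    dist.init_process_group(backend="gloo")
    pg = dist.new_group(backend="gloo")  # dedicated CPU-capable group, as in the PR
    start = time.monotonic()
    time.sleep(1)
    print(f"rank {dist.get_rank()}: save? {should_save(start, 0.5, pg)}")
    dist.destroy_process_group()
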
@@ -4,14 +4,16 @@
import argparse
import os
from dataclasses import dataclass, field
-from typing import List, Union
+from typing import Any, Dict, List, Union

# torch imports
import torch
import torch.nn.functional as F
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

from torchtrain.checkpoint import CheckpointManager, IntervalType

# torchtrain related
from torchtrain.datasets import create_tokenizer, dataloader_fn
from torchtrain.logging_utils import init_logger, rank0_log
@@ -29,6 +31,18 @@ class TrainState:
    current_loss: float = -1
    losses: List[float] = field(default_factory=list)

    def state_dict(self) -> Dict[str, Any]:
        return {
            "step": torch.tensor(self.step, dtype=torch.int32),
            "current_loss": torch.tensor(self.current_loss, dtype=torch.float32),
            "losses": torch.tensor(self.losses, dtype=torch.float32),
        }

[Review comment] nit: to avoid confusion with the model/optim state dict, we should rename this to something like …
[Reply] This naming is required by DCP.

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        self.step = state_dict["step"].item()
        self.current_loss = state_dict["current_loss"].item()
        self.losses = state_dict["losses"].tolist()

[Review comment] ditto …
[Reply] This naming is required by DCP.


def build_optimizer(model, args):
    # build optimizer
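As the replies above note, the state_dict / load_state_dict names are not arbitrary: DCP treats any object placed in the saved state dictionary that exposes this pair of methods as a stateful component, calling state_dict() when saving and load_state_dict() when restoring. That is the contract TrainState here, and ModelWrapper / OptimizerWrapper in the checkpoint module, all satisfy. Below is a minimal sketch of the same contract; the RunningAverage class is hypothetical and only illustrates what a user-defined entry in the states dict would need to provide.

from typing import Any, Dict

import torch


class RunningAverage:
    """Hypothetical stateful object: anything exposing this pair of methods
    can be dropped into the states dict handled by CheckpointManager."""

    def __init__(self) -> None:
        self.total = 0.0
        self.count = 0

    def state_dict(self) -> Dict[str, Any]:
        # Called when the object is part of the state being saved.
        return {
            "total": torch.tensor(self.total, dtype=torch.float32),
            "count": torch.tensor(self.count, dtype=torch.int32),
        }

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        # Called with the restored values on load.
        self.total = state_dict["total"].item()
        self.count = int(state_dict["count"].item())

Renaming the methods, as the nit suggests, would break this contract, which is why the reply points out that the naming is required by DCP.
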
@@ -116,7 +130,22 @@ def main(args):
    # train loop
    model.train()

    checkpoint = CheckpointManager(
        model=model,
        optimizer=optimizer,
        states={"train_state": train_state},
        folder=args.checkpoint_folder,
        interval_type=(
            IntervalType.SECONDS
            if args.checkpoint_interval_type == "seconds"
            else IntervalType.STEPS
        ),
        interval=args.checkpoint_interval,
    )
    checkpoint.load()

    with maybe_run_profiler() as torch_profiler:
        checkpoint.reset()
        while train_state.step < args.steps or args.steps == -1:
            train_state.step += 1
            # get batch
@@ -161,6 +190,8 @@ def main(args):
            )
            scheduler.step()

            checkpoint.save(train_state.step, force=(train_state.step == args.steps))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="TorchTrain arg parser.")
@@ -224,6 +255,33 @@ def main(args):
    parser.add_argument(
        "--compile", action="store_true", help="Whether to compile the model."
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=3600,
        help=(
            "Checkpointing interval. The unit of measurement is in seconds or "
            "steps depending on --checkpoint-interval-type."
        ),
    )
    parser.add_argument(
        "--checkpoint-interval-type",
        type=str,
        default="steps",
        help=(
            "The checkpointing interval unit of measurement. "
            "The default value is steps."
        ),
    )
    parser.add_argument(
        "--checkpoint-folder",
        type=str,
        default="",
        help=(
            "The folder to store the checkpoints. If this is not specified or "
            "is an empty string, checkpointing is disabled."
        ),
    )

    args = parser.parse_args()
    main(args)

[Review comment] Can we change this to something like /tmp/torchtrain so that it saves somewhere when we run it locally?
[Reply] We should make this an opt-in feature, so that people don't get surprised and don't save too many files to /tmp when they share a machine. Also, once a training run finishes there will be a checkpoint, and users may unknowingly skip all of their new training because the existing last-step checkpoint gets loaded. That happens a lot. So it's better to make it opt-in.
[Reply] Makes sense.
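
The worry in the reply above, that a shared default such as /tmp would make a fresh run silently resume from a finished run's checkpoint, follows directly from how load() picks a step when none is given: it scans the folder for step-N entries and resumes from the largest N. The helper below is a small sketch of that discovery logic; latest_step is an illustrative name, not a function in the PR.

import os
import re


def latest_step(folder: str) -> int:
    """Return the largest N among 'step-N' entries in folder, or -1 if none."""
    steps = []
    for name in os.listdir(folder):
        match = re.search(r"step-(\d+)", name)
        if match:
            steps.append(int(match.group(1)))
    return max(steps) if steps else -1


# e.g. a folder containing step-100/ and step-250/ resumes from step 250,
# regardless of whether those checkpoints came from the current run.
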