[Unified Checkpoint] Update async save info (PaddlePaddle#8981)
* update optimizer async save signal

* update async save info
DesmonDay authored Aug 21, 2024
1 parent 57799fb commit 3e87308
Showing 1 changed file with 19 additions and 0 deletions: paddlenlp/trainer/trainer.py
@@ -19,6 +19,7 @@
 import collections
 import contextlib
 import inspect
+import json
 import math
 import os
 import random
@@ -2415,6 +2416,24 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None, merge_tensor_
         # Save a trained model and configuration using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
 
+        local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))
+        if (
+            strtobool(os.getenv("FLAG_LLM_PDC", "False"))
+            and local_rank == 0
+            and self.args.unified_checkpoint
+            and "async_save" in self.args.unified_checkpoint_config
+        ):
+            os.makedirs(self.args.logging_dir, exist_ok=True)
+            world_size = paddle.distributed.get_world_size()
+            save_info = {
+                "world_size": world_size,
+                "ignore_save_lr_and_optim": self.args.ignore_save_lr_and_optim,
+                "skip_save_model_weight": "skip_save_model_weight" in self.args.unified_checkpoint_config,
+            }
+            if not os.path.exists(os.path.join(self.args.logging_dir, "async_save_info.json")):
+                with open(os.path.join(self.args.logging_dir, "async_save_info.json"), "w") as f:
+                    json.dump(save_info, f)
+
         if self.args.should_save:
             if self.tokenizer is not None:
                 self.tokenizer.save_pretrained(output_dir)
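For context, a minimal sketch (not part of this commit) of how a downstream process might read the async_save_info.json file that the new code writes. The file path and key names mirror the diff above; the logging_dir value and the consumer script itself are hypothetical.

    import json
    import os

    # Hypothetical stand-in for self.args.logging_dir; the real value
    # comes from the training arguments.
    logging_dir = "./logs"

    # `_save` writes this file once, on local rank 0, when unified
    # checkpointing with "async_save" is enabled.
    info_path = os.path.join(logging_dir, "async_save_info.json")
    with open(info_path) as f:
        save_info = json.load(f)

    # Keys written by `_save` in this commit:
    print(save_info["world_size"])                # e.g. 8
    print(save_info["ignore_save_lr_and_optim"])  # bool
    print(save_info["skip_save_model_weight"])    # bool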
