update release_grads
DesmonDay committed Jul 30, 2024
1 parent 96205eb commit a218bf8
Showing 2 changed files with 10 additions and 4 deletions.
9 changes: 5 additions & 4 deletions paddlenlp/trainer/trainer.py
@@ -1062,11 +1062,12 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
 if optimizer_was_run:
     self.lr_scheduler.step()

-if enable_release_grads and args.pipeline_parallel_degree > 1:
+if args.release_grads or enable_release_grads:
     self.optimizer.clear_grad(set_to_zero=False)
-    for _, buffers in model._chunk_2_comm_buffers.items():
-        for buffer in buffers:
-            buffer._clear_grad_storage()
+    if args.pipeline_parallel_degree > 1:
+        for _, buffers in model._chunk_2_comm_buffers.items():
+            for buffer in buffers:
+                buffer._clear_grad_storage()
 else:
     self.optimizer.clear_grad()

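The net effect of the trainer change: gradient release is no longer tied to pipeline parallelism. Whenever the new `release_grads` argument (or the existing internal `enable_release_grads` switch) is on, gradients are cleared with `set_to_zero=False`, and the pipeline communication buffers are only drained when `pipeline_parallel_degree > 1`. The snippet below is a minimal sketch, not part of this commit, of the underlying Paddle call the trainer relies on; the toy model, optimizer, and data are placeholders.

import paddle

# Placeholder model/optimizer, for illustration only.
model = paddle.nn.Linear(4, 4)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())

loss = model(paddle.randn([2, 4])).mean()
loss.backward()
opt.step()

# Default path in the trainer (the `else` branch): gradient tensors are kept and zero-filled.
opt.clear_grad()

# release_grads path: gradient storage is released instead of zeroed,
# lowering peak memory between optimizer steps.
opt.clear_grad(set_to_zero=False)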
5 changes: 5 additions & 0 deletions paddlenlp/trainer/training_args.py
@@ -344,6 +344,8 @@ class TrainingArguments:
         Whether skip profile timer, timer will record time usage of forward/ backward/ step, etc.
     distributed_dataloader (`bool`, *optional*):
         Whether to use distributed dataloader. Default is `False`.
+    release_grads (`bool`, *optional*):
+        Whether to release gradients during training. Default is `False`.
 """

 output_dir: str = field(
@@ -791,6 +793,9 @@ class TrainingArguments:
     default=False,
     metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"},
 )
+release_grads: Optional[bool] = field(
+    default=False, metadata={"help": "Whether to release gradients during training. Default is `False`."}
+)

 def __post_init__(self):
     env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1))
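Because `release_grads` is a plain `TrainingArguments` field, it can be set like any other option. A hypothetical usage sketch follows; the output directory and values are illustrative, not part of this commit.

from paddlenlp.trainer import TrainingArguments

# Illustrative values only; output_dir is a placeholder path.
args = TrainingArguments(
    output_dir="./checkpoints",
    release_grads=True,  # release gradient storage after each optimizer step
)
assert args.release_grads

Scripts that parse `TrainingArguments` with `PdArgumentParser` would typically accept the same option on the command line as `--release_grads`.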
