From 36f5ede2aeb5ee2b79ff82da37262ca7e4d58d66 Mon Sep 17 00:00:00 2001
From: Moshe Berchansky
Date: Wed, 22 Feb 2023 12:42:55 +0200
Subject: [PATCH 1/2] Fix resume_from_checkpoint for deepspeed

Fix resume_from_checkpoint for deepspeed, by ensuring that the deepspeed engine is the one to load the checkpoint.
---
 src/transformers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 1f7df7e9f3b0..bf0538516864 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1616,7 +1616,7 @@ def train(
             if resume_from_checkpoint is None:
                 raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")
 
-        if resume_from_checkpoint is not None and not is_sagemaker_mp_enabled():
+        if resume_from_checkpoint is not None and not is_sagemaker_mp_enabled() and args.deepspeed is None:
             self._load_from_checkpoint(resume_from_checkpoint)
 
         # If model was re-initialized, put it on the right device and update self.model_wrapped

From da93e8431dfab412f55c256cb5b43e67c76d8267 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Fri, 24 Feb 2023 14:26:18 +0100
Subject: [PATCH 2/2] Empty commit to trigger CI