From 8d8545a0143deb4b3160a41343155562e9ba885f Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Thu, 4 Aug 2022 15:43:18 +0200 Subject: [PATCH 01/11] Added accelerate gradient accumulation wrapper to run_image_classification_no_trainer.py example script --- .../run_image_classification_no_trainer.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index f10a54add791..285336dd0f8f 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -213,7 +213,10 @@ def main(): # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() + Accelerator(log_with=args.report_to, logging_dir=args.output_dir, + gradient_accumulation_steps=args.gradient_accumulation_steps) + if args.with_tracking + else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps) ) logger.info(accelerator.state) # Make one log on every process with the configuration for debugging. @@ -385,7 +388,7 @@ def collate_fn(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs ) # Prepare everything with our `accelerator`. 
@@ -467,17 +470,20 @@ def collate_fn(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 From a8b4322c8f91c193248d27168ec9abee6588c15a Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Thu, 4 Aug 2022 15:53:29 +0200 Subject: [PATCH 02/11] make fixup changes --- .../run_image_classification_no_trainer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 285336dd0f8f..3c241f65f708 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -213,8 +213,11 @@ def main(): # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir, - gradient_accumulation_steps=args.gradient_accumulation_steps) + Accelerator( + log_with=args.report_to, + logging_dir=args.output_dir, + gradient_accumulation_steps=args.gradient_accumulation_steps, + ) if args.with_tracking else 
Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps) ) @@ -388,7 +391,7 @@ def collate_fn(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, - num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs + num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs, ) # Prepare everything with our `accelerator`. From eed96f44ab01e26edba211b5cefe5941b374d2d7 Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Thu, 4 Aug 2022 16:54:22 +0200 Subject: [PATCH 03/11] PR comments --- .../run_image_classification_no_trainer.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 3c241f65f708..cf28edc88206 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -212,15 +212,16 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator( - log_with=args.report_to, - logging_dir=args.output_dir, - gradient_accumulation_steps=args.gradient_accumulation_steps, - ) - if args.with_tracking - else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps) - ) + accelerator_kwargs = { + "gradient_accumulation_steps": args.gradient_accumulation_steps + } + + if args.with_tracking: + accelerator_kwargs["log_with"] = args.report_to + accelerator_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(**accelerator_kwargs) + logger.info(accelerator.state) # Make one log on every process with the configuration for debugging. 
logging.basicConfig( @@ -383,15 +384,17 @@ def collate_fn(examples): # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + lr_num_training_steps = len(train_dataloader) * num_update_steps_per_epoch if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True + lr_num_training_steps = args.max_train_steps * args.gradient_accumulation_steps lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, - num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs, + num_training_steps=lr_num_training_steps ) # Prepare everything with our `accelerator`. From 1343bc4121b31d7856f121d5f61a056aebd3d3c9 Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Thu, 4 Aug 2022 17:03:21 +0200 Subject: [PATCH 04/11] changed input to Accelerator based on PR comment, ran make fixup --- .../run_image_classification_no_trainer.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index cf28edc88206..a4a784212c9c 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -212,15 +212,13 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator_kwargs = { - "gradient_accumulation_steps": args.gradient_accumulation_steps - } + accelerator_log_kwargs = {} if args.with_tracking: - accelerator_kwargs["log_with"] = args.report_to - accelerator_kwargs["logging_dir"] = args.output_dir + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir - accelerator = Accelerator(**accelerator_kwargs) + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) logger.info(accelerator.state) # Make one log on every process with the configuration for debugging. @@ -394,7 +392,7 @@ def collate_fn(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, - num_training_steps=lr_num_training_steps + num_training_steps=lr_num_training_steps, ) # Prepare everything with our `accelerator`. 
From c147053e0f1a395dd01cf53aefe5b1697e774546 Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Fri, 5 Aug 2022 08:44:17 +0200 Subject: [PATCH 05/11] Added comment explaining the sync_gradients statement --- .../image-classification/run_image_classification_no_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index a4a784212c9c..c57189d89412 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -487,6 +487,7 @@ def collate_fn(examples): lr_scheduler.step() optimizer.zero_grad() + # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 From 956367afa401734d01b4919fa4e993813aa956d0 Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Fri, 5 Aug 2022 09:22:56 +0200 Subject: [PATCH 06/11] Fixed lr scheduler max steps --- .../run_image_classification_no_trainer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index c57189d89412..859583f46272 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -382,17 +382,15 @@ def collate_fn(examples): # Scheduler and math around the number of training steps. 
overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - lr_num_training_steps = len(train_dataloader) * num_update_steps_per_epoch if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True - lr_num_training_steps = args.max_train_steps * args.gradient_accumulation_steps lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, - num_training_steps=lr_num_training_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. From 4b73faf201a98a705c785dc195f9eefc67227548 Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Fri, 5 Aug 2022 09:23:21 +0200 Subject: [PATCH 07/11] Changed run_clm_no_trainer.py script to use accelerate gradient accum wrapper --- .../language-modeling/run_clm_no_trainer.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 21dc568fd448..44e34623e4ef 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -249,9 +249,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -567,19 +572,23 @@ def group_texts(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) optimizer.step() - lr_scheduler.step() optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + lr_scheduler.step() + progress_bar.update(1) + completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: From 27a79c1013742ab0708f2143731a1fa0bdfc684f Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Fri, 5 Aug 2022 10:58:59 +0200 Subject: [PATCH 08/11] Fixed all scripts except wav2vec2 pretraining to use accelerate gradient 
accum wrapper --- .../run_image_classification_no_trainer.py | 3 +- .../language-modeling/run_clm_no_trainer.py | 15 ++++---- .../language-modeling/run_mlm_no_trainer.py | 34 ++++++++++------- .../multiple-choice/run_swag_no_trainer.py | 36 +++++++++++------- .../run_qa_beam_search_no_trainer.py | 38 ++++++++++++------- .../question-answering/run_qa_no_trainer.py | 35 ++++++++++------- .../run_semantic_segmentation_no_trainer.py | 34 ++++++++++------- .../run_summarization_no_trainer.py | 33 +++++++++------- 8 files changed, 139 insertions(+), 89 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 859583f46272..1bd190d1303e 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -389,7 +389,7 @@ def collate_fn(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) @@ -479,7 +479,6 @@ def collate_fn(examples): # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() - accelerator.backward(loss) optimizer.step() lr_scheduler.step() diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 44e34623e4ef..3fd67d5fbf66 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -491,8 +491,8 @@ def group_texts(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + 
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -579,16 +579,15 @@ def group_texts(examples): # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() - accelerator.backward(loss) optimizer.step() + lr_scheduler.step() optimizer.zero_grad() - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - lr_scheduler.step() - progress_bar.update(1) - completed_steps += 1 + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index b7b085e5b61b..80dfcf9a9194 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -258,9 +258,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. 
logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -530,8 +535,8 @@ def group_texts(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -611,17 +616,20 @@ def group_texts(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 7d5d7588c694..7d47a0e7a6d5 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -65,7 +65,7 @@ def parse_args(): - parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task") + parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task") parser.add_argument( "--dataset_name", type=str, @@ -284,9 
+284,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -483,8 +488,8 @@ def preprocess_function(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -567,17 +572,20 @@ def preprocess_function(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index e6c66e379a96..6d2845d4f919 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -297,8 +297,16 @@ def main(): send_example_telemetry("run_qa_beam_search_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
- # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment - accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -739,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * num_update_steps_per_epoch, + num_training_steps=args.max_train_steps * num_update_steps_per_epoch, ) # Prepare everything with our `accelerator`. 
@@ -818,17 +826,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) + optimizer.step() - lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index ec86d95b5e59..4ca2d143c25f 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -337,9 +337,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -757,8 +762,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -839,17 +844,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 237934b762d5..2b89b3a9016d 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -326,9 +326,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() @@ -487,8 +492,8 @@ def preprocess_val(example_batch): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -563,17 +568,20 @@ def preprocess_val(example_batch): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ca9ef6ba9fa2..96781b6dcadb 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -330,9 +330,13 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) if args.source_prefix is None and args.model_name_or_path in [ "t5-small", "t5-base", @@ -552,8 +556,8 @@ def postprocess_text(preds, labels): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -635,17 +639,20 @@ def postprocess_text(preds, labels): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 From 5573c54fa00d93e54c9e24c8a334f4cff0c8a35d Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Fri, 5 Aug 2022 21:44:26 +0200 Subject: [PATCH 09/11] Added accelerate gradient accum wrapper for wav2vec2_pretraining_no_trainer.py script --- .../run_wav2vec2_pretraining_no_trainer.py | 96 +++++++++---------- 1 file changed, 46 insertions(+), 50 deletions(-) diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index a3db215d08bd..fd32606e8643 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -368,7 +368,7 @@ def main(): send_example_telemetry("run_wav2vec2_pretraining_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
- accelerator = Accelerator() + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() @@ -585,60 +585,56 @@ def prepare_dataset(batch): ) percent_masked = num_losses / sub_attention_mask.sum() - # forward - outputs = model(**batch) - - # divide loss by gradient accumulation steps since gradients - # are accumulated for multiple backward passes in PyTorch - loss = outputs.loss / args.gradient_accumulation_steps - accelerator.backward(loss) - - # make sure that `num_losses` is summed for distributed training - # and average gradients over losses of all devices - if accelerator.state.num_processes > 1: - num_losses = accelerator.gather(num_losses).sum() - gradient_multiplier = accelerator.state.num_processes / num_losses - multiply_grads(model.module.parameters(), gradient_multiplier) - else: - multiply_grads(model.parameters(), 1 / num_losses) - - # update step - if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + with accelerator.accumulate(model): + # forward + outputs = model(**batch) + accelerator.backward(outputs.loss) - # compute grad norm for monitoring - scale = ( - accelerator.scaler._scale.item() - if hasattr(accelerator, "scaler") and accelerator.scaler is not None - else 1 - ) + # make sure that `num_losses` is summed for distributed training + # and average gradients over losses of all devices if accelerator.state.num_processes > 1: - grad_norm = get_grad_norm(model.module.parameters(), scale) + num_losses = accelerator.gather(num_losses).sum() + gradient_multiplier = accelerator.state.num_processes / num_losses + multiply_grads(model.module.parameters(), gradient_multiplier) else: - grad_norm = get_grad_norm(model.parameters(), scale) - - # update parameters - optimizer.step() - optimizer.zero_grad() - - if not 
accelerator.optimizer_step_was_skipped: - lr_scheduler.step() - elif accelerator.is_local_main_process: - progress_bar.write( - f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..." + multiply_grads(model.parameters(), 1 / num_losses) + + # Checks if the accelerator will perform an optimization step behind the scenes + if accelerator.sync_gradients: + # compute grad norm for monitoring + scale = ( + accelerator.scaler._scale.item() + if hasattr(accelerator, "scaler") and accelerator.scaler is not None + else 1 ) + if accelerator.state.num_processes > 1: + grad_norm = get_grad_norm(model.module.parameters(), scale) + else: + grad_norm = get_grad_norm(model.parameters(), scale) + + # update parameters + optimizer.step() + optimizer.zero_grad() + + if not accelerator.optimizer_step_was_skipped: + lr_scheduler.step() + elif accelerator.is_local_main_process: + progress_bar.write( + f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..." + ) + + # update gumbel temperature + gumbel_temperature = max( + args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps, + args.min_gumbel_temperature, + ) + if hasattr(model, "module"): + model.module.set_gumbel_temperature(gumbel_temperature) + else: + model.set_gumbel_temperature(gumbel_temperature) - # update gumbel temperature - gumbel_temperature = max( - args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps, - args.min_gumbel_temperature, - ) - if hasattr(model, "module"): - model.module.set_gumbel_temperature(gumbel_temperature) - else: - model.set_gumbel_temperature(gumbel_temperature) - - progress_bar.update(1) - completed_steps += 1 + progress_bar.update(1) + completed_steps += 1 # 6. 
Log all results if (step + 1) % (args.gradient_accumulation_steps * args.logging_steps) == 0: From d7d8b09b9bf96c1aeeeff3cc004a5d147ae7486f Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Fri, 5 Aug 2022 21:50:29 +0200 Subject: [PATCH 10/11] make fixup and lr_scheduler step inserted back into run_qa_beam_search_no_trainer.py --- .../question-answering/run_qa_beam_search_no_trainer.py | 1 + .../speech-pretraining/run_wav2vec2_pretraining_no_trainer.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 6d2845d4f919..4237a1702a19 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -837,6 +837,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): accelerator.backward(loss) optimizer.step() + lr_scheduler.step() optimizer.zero_grad() # Checks if the accelerator has performed an optimization step behind the scenes diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index fd32606e8643..8b6f2cc99f1f 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -588,7 +588,8 @@ def prepare_dataset(batch): with accelerator.accumulate(model): # forward outputs = model(**batch) - accelerator.backward(outputs.loss) + loss = outputs.loss + accelerator.backward(loss) # make sure that `num_losses` is summed for distributed training # and average gradients over losses of all devices From 01f7f07b09e24c1c3bcea4d3516bf4c4bbd39886 Mon Sep 17 00:00:00 2001 From: arpelarpe Date: Mon, 8 Aug 2022 20:38:51 +0200 Subject: [PATCH 11/11] removed changes to 
run_wav2vec2_pretraining_no_trainer.py script and fixed using wrong constant in qa_beam_search_no_trainer.py script --- .../run_qa_beam_search_no_trainer.py | 4 +- .../run_wav2vec2_pretraining_no_trainer.py | 97 ++++++++++--------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 4237a1702a19..ce47f1e1dee0 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -747,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps * num_update_steps_per_epoch, - num_training_steps=args.max_train_steps * num_update_steps_per_epoch, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index 8b6f2cc99f1f..a3db215d08bd 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -368,7 +368,7 @@ def main(): send_example_telemetry("run_wav2vec2_pretraining_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
- accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps) + accelerator = Accelerator() logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() @@ -585,57 +585,60 @@ def prepare_dataset(batch): ) percent_masked = num_losses / sub_attention_mask.sum() - with accelerator.accumulate(model): - # forward - outputs = model(**batch) - loss = outputs.loss - accelerator.backward(loss) + # forward + outputs = model(**batch) + + # divide loss by gradient accumulation steps since gradients + # are accumulated for multiple backward passes in PyTorch + loss = outputs.loss / args.gradient_accumulation_steps + accelerator.backward(loss) + + # make sure that `num_losses` is summed for distributed training + # and average gradients over losses of all devices + if accelerator.state.num_processes > 1: + num_losses = accelerator.gather(num_losses).sum() + gradient_multiplier = accelerator.state.num_processes / num_losses + multiply_grads(model.module.parameters(), gradient_multiplier) + else: + multiply_grads(model.parameters(), 1 / num_losses) + + # update step + if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: - # make sure that `num_losses` is summed for distributed training - # and average gradients over losses of all devices + # compute grad norm for monitoring + scale = ( + accelerator.scaler._scale.item() + if hasattr(accelerator, "scaler") and accelerator.scaler is not None + else 1 + ) if accelerator.state.num_processes > 1: - num_losses = accelerator.gather(num_losses).sum() - gradient_multiplier = accelerator.state.num_processes / num_losses - multiply_grads(model.module.parameters(), gradient_multiplier) + grad_norm = get_grad_norm(model.module.parameters(), scale) else: - multiply_grads(model.parameters(), 1 / num_losses) - - # Checks if the accelerator will perform an optimization step behind the scenes - if 
accelerator.sync_gradients: - # compute grad norm for monitoring - scale = ( - accelerator.scaler._scale.item() - if hasattr(accelerator, "scaler") and accelerator.scaler is not None - else 1 - ) - if accelerator.state.num_processes > 1: - grad_norm = get_grad_norm(model.module.parameters(), scale) - else: - grad_norm = get_grad_norm(model.parameters(), scale) - - # update parameters - optimizer.step() - optimizer.zero_grad() - - if not accelerator.optimizer_step_was_skipped: - lr_scheduler.step() - elif accelerator.is_local_main_process: - progress_bar.write( - f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..." - ) - - # update gumbel temperature - gumbel_temperature = max( - args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps, - args.min_gumbel_temperature, + grad_norm = get_grad_norm(model.parameters(), scale) + + # update parameters + optimizer.step() + optimizer.zero_grad() + + if not accelerator.optimizer_step_was_skipped: + lr_scheduler.step() + elif accelerator.is_local_main_process: + progress_bar.write( + f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..." ) - if hasattr(model, "module"): - model.module.set_gumbel_temperature(gumbel_temperature) - else: - model.set_gumbel_temperature(gumbel_temperature) - progress_bar.update(1) - completed_steps += 1 + # update gumbel temperature + gumbel_temperature = max( + args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps, + args.min_gumbel_temperature, + ) + if hasattr(model, "module"): + model.module.set_gumbel_temperature(gumbel_temperature) + else: + model.set_gumbel_temperature(gumbel_temperature) + + progress_bar.update(1) + completed_steps += 1 # 6. Log all results if (step + 1) % (args.gradient_accumulation_steps * args.logging_steps) == 0: