From 8d8545a0143deb4b3160a41343155562e9ba885f Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Thu, 4 Aug 2022 15:43:18 +0200
Subject: [PATCH 01/11] Added accelerate gradient accumulation wrapper to
 run_image_classification_no_trainer.py example script

---
 .../run_image_classification_no_trainer.py    | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index f10a54add791..285336dd0f8f 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -213,7 +213,10 @@ def main():
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
     accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+        Accelerator(log_with=args.report_to, logging_dir=args.output_dir,
+                    gradient_accumulation_steps=args.gradient_accumulation_steps)
+        if args.with_tracking
+        else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
     )
     logger.info(accelerator.state)
     # Make one log on every process with the configuration for debugging.
@@ -385,7 +388,7 @@ def collate_fn(examples):
         name=args.lr_scheduler_type,
         optimizer=optimizer,
         num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs
     )
 
     # Prepare everything with our `accelerator`.
@@ -467,17 +470,20 @@ def collate_fn(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
 

From a8b4322c8f91c193248d27168ec9abee6588c15a Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Thu, 4 Aug 2022 15:53:29 +0200
Subject: [PATCH 02/11] make fixup changes

---
 .../run_image_classification_no_trainer.py               | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 285336dd0f8f..3c241f65f708 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -213,8 +213,11 @@ def main():
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
     accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir,
-                    gradient_accumulation_steps=args.gradient_accumulation_steps)
+        Accelerator(
+            log_with=args.report_to,
+            logging_dir=args.output_dir,
+            gradient_accumulation_steps=args.gradient_accumulation_steps,
+        )
         if args.with_tracking
         else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
     )
@@ -388,7 +391,7 @@ def collate_fn(examples):
         name=args.lr_scheduler_type,
         optimizer=optimizer,
         num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs
+        num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs,
     )
 
     # Prepare everything with our `accelerator`.

From eed96f44ab01e26edba211b5cefe5941b374d2d7 Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Thu, 4 Aug 2022 16:54:22 +0200
Subject: [PATCH 03/11] Address PR comments

---
 .../run_image_classification_no_trainer.py    | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 3c241f65f708..cf28edc88206 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -212,15 +212,16 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(
-            log_with=args.report_to,
-            logging_dir=args.output_dir,
-            gradient_accumulation_steps=args.gradient_accumulation_steps,
-        )
-        if args.with_tracking
-        else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
-    )
+    accelerator_kwargs = {
+        "gradient_accumulation_steps": args.gradient_accumulation_steps
+    }
+
+    if args.with_tracking:
+        accelerator_kwargs["log_with"] = args.report_to
+        accelerator_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(**accelerator_kwargs)
+
     logger.info(accelerator.state)
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
@@ -383,15 +384,17 @@ def collate_fn(examples):
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    lr_num_training_steps = len(train_dataloader) * num_update_steps_per_epoch
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
         overrode_max_train_steps = True
+        lr_num_training_steps = args.max_train_steps * args.gradient_accumulation_steps
 
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
         num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=math.ceil(len(train_dataloader)) * args.num_train_epochs,
+        num_training_steps=lr_num_training_steps
     )
 
     # Prepare everything with our `accelerator`.

From 1343bc4121b31d7856f121d5f61a056aebd3d3c9 Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Thu, 4 Aug 2022 17:03:21 +0200
Subject: [PATCH 04/11] changed input to Accelerator based on PR comment, ran
 make fixup

---
 .../run_image_classification_no_trainer.py           | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index cf28edc88206..a4a784212c9c 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -212,15 +212,13 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator_kwargs = {
-        "gradient_accumulation_steps": args.gradient_accumulation_steps
-    }
+    accelerator_log_kwargs = {}
 
     if args.with_tracking:
-        accelerator_kwargs["log_with"] = args.report_to
-        accelerator_kwargs["logging_dir"] = args.output_dir
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
 
-    accelerator = Accelerator(**accelerator_kwargs)
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
 
     logger.info(accelerator.state)
     # Make one log on every process with the configuration for debugging.
@@ -394,7 +392,7 @@ def collate_fn(examples):
         name=args.lr_scheduler_type,
         optimizer=optimizer,
         num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=lr_num_training_steps
+        num_training_steps=lr_num_training_steps,
     )
 
     # Prepare everything with our `accelerator`.

From c147053e0f1a395dd01cf53aefe5b1697e774546 Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Fri, 5 Aug 2022 08:44:17 +0200
Subject: [PATCH 05/11] Added comment explaining the sync_gradients statement

---
 .../image-classification/run_image_classification_no_trainer.py  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index a4a784212c9c..c57189d89412 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -487,6 +487,7 @@ def collate_fn(examples):
                 lr_scheduler.step()
                 optimizer.zero_grad()
 
+            # Checks if the accelerator has performed an optimization step behind the scenes
             if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

From 956367afa401734d01b4919fa4e993813aa956d0 Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Fri, 5 Aug 2022 09:22:56 +0200
Subject: [PATCH 06/11] Fixed lr scheduler max steps

---
 .../run_image_classification_no_trainer.py                    | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index c57189d89412..859583f46272 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -382,17 +382,15 @@ def collate_fn(examples):
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    lr_num_training_steps = len(train_dataloader) * num_update_steps_per_epoch
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
         overrode_max_train_steps = True
-        lr_num_training_steps = args.max_train_steps * args.gradient_accumulation_steps
 
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
         num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=lr_num_training_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.

From 4b73faf201a98a705c785dc195f9eefc67227548 Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Fri, 5 Aug 2022 09:23:21 +0200
Subject: [PATCH 07/11] Changed run_clm_no_trainer.py script to use accelerate
 gradient accumulation wrapper

---
 .../language-modeling/run_clm_no_trainer.py   | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 21dc568fd448..44e34623e4ef 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -249,9 +249,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -567,19 +572,23 @@ def group_texts(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
                 optimizer.step()
-                lr_scheduler.step()
                 optimizer.zero_grad()
-                progress_bar.update(1)
-                completed_steps += 1
+
+                # Checks if the accelerator has performed an optimization step behind the scenes
+                if accelerator.sync_gradients:
+                    lr_scheduler.step()
+                    progress_bar.update(1)
+                    completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
                 if completed_steps % checkpointing_steps == 0:

From 27a79c1013742ab0708f2143731a1fa0bdfc684f Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Fri, 5 Aug 2022 10:58:59 +0200
Subject: [PATCH 08/11] Fixed all scripts except wav2vec2 pretraining to use
 accelerate gradient accumulation wrapper

---
 .../run_image_classification_no_trainer.py    |  3 +-
 .../language-modeling/run_clm_no_trainer.py   | 15 ++++----
 .../language-modeling/run_mlm_no_trainer.py   | 34 ++++++++++-------
 .../multiple-choice/run_swag_no_trainer.py    | 36 +++++++++++-------
 .../run_qa_beam_search_no_trainer.py          | 38 ++++++++++++-------
 .../question-answering/run_qa_no_trainer.py   | 35 ++++++++++-------
 .../run_semantic_segmentation_no_trainer.py   | 34 ++++++++++-------
 .../run_summarization_no_trainer.py           | 33 +++++++++-------
 8 files changed, 139 insertions(+), 89 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 859583f46272..1bd190d1303e 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -389,7 +389,7 @@ def collate_fn(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
         num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
@@ -479,7 +479,6 @@ def collate_fn(examples):
                 # We keep track of the loss at each epoch
                 if args.with_tracking:
                     total_loss += loss.detach().float()
-
                 accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 44e34623e4ef..3fd67d5fbf66 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -491,8 +491,8 @@ def group_texts(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.
@@ -579,16 +579,15 @@ def group_texts(examples):
                 # We keep track of the loss at each epoch
                 if args.with_tracking:
                     total_loss += loss.detach().float()
-
                 accelerator.backward(loss)
                 optimizer.step()
+                lr_scheduler.step()
                 optimizer.zero_grad()
 
-                # Checks if the accelerator has performed an optimization step behind the scenes
-                if accelerator.sync_gradients:
-                    lr_scheduler.step()
-                    progress_bar.update(1)
-                    completed_steps += 1
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
                 if completed_steps % checkpointing_steps == 0:
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index b7b085e5b61b..80dfcf9a9194 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -258,9 +258,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -530,8 +535,8 @@ def group_texts(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.
@@ -611,17 +616,20 @@ def group_texts(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
 
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 7d5d7588c694..7d47a0e7a6d5 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -65,7 +65,7 @@
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task")
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -284,9 +284,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -483,8 +488,8 @@ def preprocess_function(examples):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.
@@ -567,17 +572,20 @@ def preprocess_function(examples):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
 
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index e6c66e379a96..6d2845d4f919 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -297,8 +297,16 @@ def main():
     send_example_telemetry("run_qa_beam_search_no_trainer", args)
 
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers
+    # in the environment
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -739,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * num_update_steps_per_epoch,
+        num_training_steps=args.max_train_steps * num_update_steps_per_epoch,
     )
 
     # Prepare everything with our `accelerator`.
@@ -818,17 +826,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
+
                 optimizer.step()
-                lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
 
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index ec86d95b5e59..4ca2d143c25f 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -337,9 +337,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -757,8 +762,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.
@@ -839,17 +844,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
 
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index 237934b762d5..2b89b3a9016d 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -326,9 +326,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     logger.info(accelerator.state, main_process_only=False)
     if accelerator.is_local_main_process:
         datasets.utils.logging.set_verbosity_warning()
@@ -487,8 +492,8 @@ def preprocess_val(example_batch):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.
@@ -563,17 +568,20 @@ def preprocess_val(example_batch):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
 
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index ca9ef6ba9fa2..96781b6dcadb 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -330,9 +330,13 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
     if args.source_prefix is None and args.model_name_or_path in [
         "t5-small",
         "t5-base",
@@ -552,8 +556,8 @@ def postprocess_text(preds, labels):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.
@@ -635,17 +639,20 @@ def postprocess_text(preds, labels):
                 if resume_step is not None and step < resume_step:
                     completed_steps += 1
                     continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
 

From 5573c54fa00d93e54c9e24c8a334f4cff0c8a35d Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Fri, 5 Aug 2022 21:44:26 +0200
Subject: [PATCH 09/11] Added accelerate gradient accumulation wrapper for
 run_wav2vec2_pretraining_no_trainer.py script

---
 .../run_wav2vec2_pretraining_no_trainer.py    | 96 +++++++++----------
 1 file changed, 46 insertions(+), 50 deletions(-)

diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
index a3db215d08bd..fd32606e8643 100755
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -368,7 +368,7 @@ def main():
     send_example_telemetry("run_wav2vec2_pretraining_no_trainer", args)
 
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    accelerator = Accelerator()
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
     logger.info(accelerator.state, main_process_only=False)
     if accelerator.is_local_main_process:
         datasets.utils.logging.set_verbosity_warning()
@@ -585,60 +585,56 @@ def prepare_dataset(batch):
             )
             percent_masked = num_losses / sub_attention_mask.sum()
 
-            # forward
-            outputs = model(**batch)
-
-            # divide loss by gradient accumulation steps since gradients
-            # are accumulated for multiple backward passes in PyTorch
-            loss = outputs.loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-
-            # make sure that `num_losses` is summed for distributed training
-            # and average gradients over losses of all devices
-            if accelerator.state.num_processes > 1:
-                num_losses = accelerator.gather(num_losses).sum()
-                gradient_multiplier = accelerator.state.num_processes / num_losses
-                multiply_grads(model.module.parameters(), gradient_multiplier)
-            else:
-                multiply_grads(model.parameters(), 1 / num_losses)
-
-            # update step
-            if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+            with accelerator.accumulate(model):
+                # forward
+                outputs = model(**batch)
+                accelerator.backward(outputs.loss)
 
-                # compute grad norm for monitoring
-                scale = (
-                    accelerator.scaler._scale.item()
-                    if hasattr(accelerator, "scaler") and accelerator.scaler is not None
-                    else 1
-                )
+                # make sure that `num_losses` is summed for distributed training
+                # and average gradients over losses of all devices
                 if accelerator.state.num_processes > 1:
-                    grad_norm = get_grad_norm(model.module.parameters(), scale)
+                    num_losses = accelerator.gather(num_losses).sum()
+                    gradient_multiplier = accelerator.state.num_processes / num_losses
+                    multiply_grads(model.module.parameters(), gradient_multiplier)
                 else:
-                    grad_norm = get_grad_norm(model.parameters(), scale)
-
-                # update parameters
-                optimizer.step()
-                optimizer.zero_grad()
-
-                if not accelerator.optimizer_step_was_skipped:
-                    lr_scheduler.step()
-                elif accelerator.is_local_main_process:
-                    progress_bar.write(
-                        f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..."
+                    multiply_grads(model.parameters(), 1 / num_losses)
+
+                # Checks if the accelerator will perform an optimization step behind the scenes
+                if accelerator.sync_gradients:
+                    # compute grad norm for monitoring
+                    scale = (
+                        accelerator.scaler._scale.item()
+                        if hasattr(accelerator, "scaler") and accelerator.scaler is not None
+                        else 1
                     )
+                    if accelerator.state.num_processes > 1:
+                        grad_norm = get_grad_norm(model.module.parameters(), scale)
+                    else:
+                        grad_norm = get_grad_norm(model.parameters(), scale)
+
+                    # update parameters
+                    optimizer.step()
+                    optimizer.zero_grad()
+
+                    if not accelerator.optimizer_step_was_skipped:
+                        lr_scheduler.step()
+                    elif accelerator.is_local_main_process:
+                        progress_bar.write(
+                            f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..."
+                        )
+
+                    # update gumbel temperature
+                    gumbel_temperature = max(
+                        args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps,
+                        args.min_gumbel_temperature,
+                    )
+                    if hasattr(model, "module"):
+                        model.module.set_gumbel_temperature(gumbel_temperature)
+                    else:
+                        model.set_gumbel_temperature(gumbel_temperature)
 
-                # update gumbel temperature
-                gumbel_temperature = max(
-                    args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps,
-                    args.min_gumbel_temperature,
-                )
-                if hasattr(model, "module"):
-                    model.module.set_gumbel_temperature(gumbel_temperature)
-                else:
-                    model.set_gumbel_temperature(gumbel_temperature)
-
-                progress_bar.update(1)
-                completed_steps += 1
+                    progress_bar.update(1)
+                    completed_steps += 1
 
             # 6. Log all results
             if (step + 1) % (args.gradient_accumulation_steps * args.logging_steps) == 0:

From d7d8b09b9bf96c1aeeeff3cc004a5d147ae7486f Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Fri, 5 Aug 2022 21:50:29 +0200
Subject: [PATCH 10/11] Ran make fixup and restored lr_scheduler.step() in
 run_qa_beam_search_no_trainer.py

---
 .../question-answering/run_qa_beam_search_no_trainer.py        | 1 +
 .../speech-pretraining/run_wav2vec2_pretraining_no_trainer.py  | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 6d2845d4f919..4237a1702a19 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -837,6 +837,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
                 accelerator.backward(loss)
 
                 optimizer.step()
+                lr_scheduler.step()
                 optimizer.zero_grad()
 
             # Checks if the accelerator has performed an optimization step behind the scenes
diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
index fd32606e8643..8b6f2cc99f1f 100755
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -588,7 +588,8 @@ def prepare_dataset(batch):
             with accelerator.accumulate(model):
                 # forward
                 outputs = model(**batch)
-                accelerator.backward(outputs.loss)
+                loss = outputs.loss
+                accelerator.backward(loss)
 
                 # make sure that `num_losses` is summed for distributed training
                 # and average gradients over losses of all devices

From 01f7f07b09e24c1c3bcea4d3516bf4c4bbd39886 Mon Sep 17 00:00:00 2001
From: arpelarpe <rasmus.arpe@gmail.com>
Date: Mon, 8 Aug 2022 20:38:51 +0200
Subject: [PATCH 11/11] Reverted changes to
 run_wav2vec2_pretraining_no_trainer.py script and fixed the wrong constant
 used in run_qa_beam_search_no_trainer.py script

---
 .../run_qa_beam_search_no_trainer.py          |  4 +-
 .../run_wav2vec2_pretraining_no_trainer.py    | 97 ++++++++++---------
 2 files changed, 52 insertions(+), 49 deletions(-)

diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 4237a1702a19..ce47f1e1dee0 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -747,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps * num_update_steps_per_epoch,
-        num_training_steps=args.max_train_steps * num_update_steps_per_epoch,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
     # Prepare everything with our `accelerator`.
diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
index 8b6f2cc99f1f..a3db215d08bd 100755
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -368,7 +368,7 @@ def main():
     send_example_telemetry("run_wav2vec2_pretraining_no_trainer", args)
 
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
+    accelerator = Accelerator()
     logger.info(accelerator.state, main_process_only=False)
     if accelerator.is_local_main_process:
         datasets.utils.logging.set_verbosity_warning()
@@ -585,57 +585,60 @@ def prepare_dataset(batch):
             )
             percent_masked = num_losses / sub_attention_mask.sum()
 
-            with accelerator.accumulate(model):
-                # forward
-                outputs = model(**batch)
-                loss = outputs.loss
-                accelerator.backward(loss)
+            # forward
+            outputs = model(**batch)
+
+            # divide loss by gradient accumulation steps since gradients
+            # are accumulated for multiple backward passes in PyTorch
+            loss = outputs.loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+
+            # make sure that `num_losses` is summed for distributed training
+            # and average gradients over losses of all devices
+            if accelerator.state.num_processes > 1:
+                num_losses = accelerator.gather(num_losses).sum()
+                gradient_multiplier = accelerator.state.num_processes / num_losses
+                multiply_grads(model.module.parameters(), gradient_multiplier)
+            else:
+                multiply_grads(model.parameters(), 1 / num_losses)
+
+            # update step
+            if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
 
-                # make sure that `num_losses` is summed for distributed training
-                # and average gradients over losses of all devices
+                # compute grad norm for monitoring
+                scale = (
+                    accelerator.scaler._scale.item()
+                    if hasattr(accelerator, "scaler") and accelerator.scaler is not None
+                    else 1
+                )
                 if accelerator.state.num_processes > 1:
-                    num_losses = accelerator.gather(num_losses).sum()
-                    gradient_multiplier = accelerator.state.num_processes / num_losses
-                    multiply_grads(model.module.parameters(), gradient_multiplier)
+                    grad_norm = get_grad_norm(model.module.parameters(), scale)
                 else:
-                    multiply_grads(model.parameters(), 1 / num_losses)
-
-                # Checks if the accelerator will perform an optimization step behind the scenes
-                if accelerator.sync_gradients:
-                    # compute grad norm for monitoring
-                    scale = (
-                        accelerator.scaler._scale.item()
-                        if hasattr(accelerator, "scaler") and accelerator.scaler is not None
-                        else 1
-                    )
-                    if accelerator.state.num_processes > 1:
-                        grad_norm = get_grad_norm(model.module.parameters(), scale)
-                    else:
-                        grad_norm = get_grad_norm(model.parameters(), scale)
-
-                    # update parameters
-                    optimizer.step()
-                    optimizer.zero_grad()
-
-                    if not accelerator.optimizer_step_was_skipped:
-                        lr_scheduler.step()
-                    elif accelerator.is_local_main_process:
-                        progress_bar.write(
-                            f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..."
-                        )
-
-                    # update gumbel temperature
-                    gumbel_temperature = max(
-                        args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps,
-                        args.min_gumbel_temperature,
+                    grad_norm = get_grad_norm(model.parameters(), scale)
+
+                # update parameters
+                optimizer.step()
+                optimizer.zero_grad()
+
+                if not accelerator.optimizer_step_was_skipped:
+                    lr_scheduler.step()
+                elif accelerator.is_local_main_process:
+                    progress_bar.write(
+                        f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..."
                     )
-                    if hasattr(model, "module"):
-                        model.module.set_gumbel_temperature(gumbel_temperature)
-                    else:
-                        model.set_gumbel_temperature(gumbel_temperature)
 
-                    progress_bar.update(1)
-                    completed_steps += 1
+                # update gumbel temperature
+                gumbel_temperature = max(
+                    args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps,
+                    args.min_gumbel_temperature,
+                )
+                if hasattr(model, "module"):
+                    model.module.set_gumbel_temperature(gumbel_temperature)
+                else:
+                    model.set_gumbel_temperature(gumbel_temperature)
+
+                progress_bar.update(1)
+                completed_steps += 1
 
             # 6. Log all results
             if (step + 1) % (args.gradient_accumulation_steps * args.logging_steps) == 0: