diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index f10a54add791..1bd190d1303e 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -212,9 +212,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + logger.info(accelerator.state) # Make one log on every process with the configuration for debugging. logging.basicConfig( @@ -384,8 +389,8 @@ def collate_fn(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -467,17 +472,20 @@ def collate_fn(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 21dc568fd448..3fd67d5fbf66 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -249,9 +249,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -486,8 +491,8 @@ def group_texts(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -567,17 +572,20 @@ def group_texts(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index b7b085e5b61b..80dfcf9a9194 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -258,9 +258,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -530,8 +535,8 @@ def group_texts(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -611,17 +616,20 @@ def group_texts(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 7d5d7588c694..7d47a0e7a6d5 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -65,7 +65,7 @@ def parse_args(): - parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task") + parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task") parser.add_argument( "--dataset_name", type=str, @@ -284,9 +284,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -483,8 +488,8 @@ def preprocess_function(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -567,17 +572,20 @@ def preprocess_function(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index e6c66e379a96..ce47f1e1dee0 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -297,8 +297,16 @@ def main(): send_example_telemetry("run_qa_beam_search_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment - accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -739,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -818,17 +826,22 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) + optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index ec86d95b5e59..4ca2d143c25f 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -337,9 +337,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -757,8 +762,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -839,17 +844,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 237934b762d5..2b89b3a9016d 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -326,9 +326,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() @@ -487,8 +492,8 @@ def preprocess_val(example_batch): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -563,17 +568,20 @@ def preprocess_val(example_batch): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ca9ef6ba9fa2..96781b6dcadb 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -330,9 +330,13 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) if args.source_prefix is None and args.model_name_or_path in [ "t5-small", "t5-base", @@ -552,8 +556,8 @@ def postprocess_text(preds, labels): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -635,17 +639,20 @@ def postprocess_text(preds, labels): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1