From a4174b6588e689e2b7a69de4fedcba3544b7f3c5 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 5 Apr 2022 16:44:11 -0400 Subject: [PATCH 1/8] Update first script --- .../language-modeling/run_clm_no_trainer.py | 92 +++++++++++++++++-- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index f2f678dd8a5c..3fa22a713e1b 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -185,6 +185,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -208,7 +225,11 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + if args.with_tracking: + accelerator = Accelerator(log_with="all") + else: + accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -427,16 +448,11 @@ def group_texts(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. if accelerator.distributed_type == DistributedType.TPU: model.tie_weights() - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be + # Note -> the training dataloader needs to be prepared before we grab its length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. @@ -453,6 +469,23 @@ def group_texts(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! 
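The first patch threads three new flags (`--checkpointing_steps`, `--resume_from_checkpoint`, `--with_tracking`) through `run_clm_no_trainer.py` and calls `accelerator.save_state` either every N optimizer steps or once per epoch. A minimal sketch of that saving logic, pulled out of the training loop for readability; the helper names below are illustrative and not identifiers from the patch, and an `accelerate.Accelerator` instance is assumed:

```python
from accelerate import Accelerator

def parse_checkpointing_steps(value):
    """Interpret --checkpointing_steps: an int step interval, the string 'epoch', or None."""
    if value is None:
        return None
    return int(value) if value.isdigit() else value

def maybe_save_state(accelerator: Accelerator, checkpointing_steps, completed_steps, epoch, end_of_epoch):
    """Mirror the two save points the patch adds: every N completed steps, or at each epoch boundary."""
    if isinstance(checkpointing_steps, int) and not end_of_epoch:
        if completed_steps % checkpointing_steps == 0:
            accelerator.save_state(f"step_{completed_steps}")
    elif checkpointing_steps == "epoch" and end_of_epoch:
        accelerator.save_state(f"epoch_{epoch}")
```

In the script itself the same checks are inlined in the training loop (right after the step counter is incremented, and at the end of each epoch) rather than factored into helpers.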
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -467,11 +500,39 @@ def group_texts(examples): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + state_restored = True + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path.name: + num_epochs -= int(path.name.replace("epoch_", "")) + else: + resume_step = int(path.name.replace("step_", "")) + num_epochs -= resume_step // len(train_dataloader) + resume_step = (num_epochs * len(train_dataloader)) - resume_step + state_restored = False + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -481,6 +542,10 @@ def group_texts(examples): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -502,6 +567,16 @@ def group_texts(examples): logger.info(f"epoch {epoch}: perplexity: {perplexity}") + if args.with_tracking: + accelerator.log( + { + "perplexity": perplexity, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -512,6 +587,9 @@ def group_texts(examples): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) From 041cbf668bc6ddcb98201f45daecd301e38b1acb Mon Sep 17 00:00:00 2001 From: Zachary Mueller Date: Wed, 6 Apr 2022 10:29:46 -0400 Subject: [PATCH 2/8] Reduce to a one-liner Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- examples/pytorch/language-modeling/run_clm_no_trainer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 3fa22a713e1b..31ff986c9663 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -226,10 +226,7 @@ def main(): # Initialize the accelerator. 
We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment - if args.with_tracking: - accelerator = Accelerator(log_with="all") - else: - accelerator = Accelerator() + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", From b8c9418823e4f6d63449ea94cb9ca6a9f2225bb2 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 6 Apr 2022 10:38:58 -0400 Subject: [PATCH 3/8] Wrap up nits --- .../language-modeling/run_clm_no_trainer.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 31ff986c9663..76eca5486939 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -449,9 +449,6 @@ def group_texts(examples): if accelerator.distributed_type == DistributedType.TPU: model.tie_weights() - # Note -> the training dataloader needs to be prepared before we grab its length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -498,24 +495,23 @@ def group_texts(examples): completed_steps = 0 # Potentially load in the weights and states from a previous save - state_restored = True if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) resume_step = None + path = args.resume_from_checkpoint else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path.name: - num_epochs -= int(path.name.replace("epoch_", "")) - else: - resume_step = int(path.name.replace("step_", "")) - num_epochs -= resume_step // len(train_dataloader) - resume_step = (num_epochs * len(train_dataloader)) - resume_step - state_restored = False + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step for epoch in range(args.num_train_epochs): model.train() From 0451ded39c5c171b0b5c02b07f7df19f0aaf2be2 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 6 Apr 2022 10:51:14 -0400 Subject: [PATCH 4/8] mlm + swag --- .../language-modeling/run_mlm_no_trainer.py | 86 +++++++++++++++++-- .../multiple-choice/run_swag_no_trainer.py | 86 +++++++++++++++++-- 2 files changed, 157 insertions(+), 15 deletions(-) diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 5d2ea0d4ea86..7d04867ab0de 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -194,6 +194,23 @@ def 
parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -219,7 +236,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -468,11 +486,6 @@ def group_texts(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. if accelerator.distributed_type == DistributedType.TPU: model.tie_weights() @@ -494,6 +507,23 @@ def group_texts(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! 
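The same flags and tracking hooks are added to `run_mlm_no_trainer.py` here. As a rough sketch of how the tracking pieces fit together once assembled — a trimmed loop with `args`, `model`, `optimizer`, and `dataloader` as placeholders, omitting gradient accumulation, the LR scheduler, and evaluation; the project name passed to `init_trackers` is illustrative:

```python
from accelerate import Accelerator

def train_with_tracking(args, model, optimizer, dataloader):
    # log_with="all" lets Accelerate pick up every tracker available in the
    # environment (TensorBoard, Weights & Biases, ... whichever are installed).
    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
    if args.with_tracking:
        # The scripts pass the argparse Namespace directly; a plain dict works too.
        accelerator.init_trackers("mlm_no_trainer", vars(args))

    completed_steps = 0
    for epoch in range(args.num_train_epochs):
        total_loss = 0.0
        model.train()
        for batch in dataloader:
            loss = model(**batch).loss
            # Accumulate the per-epoch training loss so it can be logged once per epoch.
            total_loss += loss.detach().float()
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
            completed_steps += 1
        if args.with_tracking:
            accelerator.log({"train_loss": total_loss, "epoch": epoch}, step=completed_steps)
```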
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -508,11 +538,38 @@ def group_texts(examples): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -522,6 +579,10 @@ def group_texts(examples): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -543,6 +604,16 @@ def group_texts(examples): logger.info(f"epoch {epoch}: perplexity: {perplexity}") + if args.with_tracking: + accelerator.log( + { + "perplexity": perplexity, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -552,6 +623,9 @@ def group_texts(examples): repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") if args.output_dir is not None: accelerator.wait_for_everyone() diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 451f2fc17b89..1922e5f94c03 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -177,6 +177,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." 
) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() if args.push_to_hub: @@ -246,7 +263,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -431,14 +449,6 @@ def preprocess_function(examples): device = accelerator.device model.to(device) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -453,6 +463,23 @@ def preprocess_function(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. 
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Metrics metric = load_metric("accuracy") @@ -470,11 +497,38 @@ def preprocess_function(examples): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -484,6 +538,10 @@ def preprocess_function(examples): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -500,6 +558,16 @@ def preprocess_function(examples): eval_metric = metric.compute() accelerator.print(f"epoch {epoch}: {eval_metric}") + if args.with_tracking: + accelerator.log( + { + "accuracy": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) From 58be84c28a03cf1fa5cbc30e1f2e071e50eb0547 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 6 Apr 2022 11:05:37 -0400 Subject: [PATCH 5/8] qa examples --- .../language-modeling/run_mlm_no_trainer.py | 16 ++-- .../multiple-choice/run_swag_no_trainer.py | 14 +-- .../run_qa_beam_search_no_trainer.py | 90 +++++++++++++++++-- .../question-answering/run_qa_no_trainer.py | 90 +++++++++++++++++-- 4 files changed, 177 insertions(+), 33 deletions(-) diff --git 
a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 7d04867ab0de..6a3b48c3b1c2 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -606,13 +606,13 @@ def group_texts(examples): if args.with_tracking: accelerator.log( - { - "perplexity": perplexity, - "train_loss": total_loss, - "epoch": epoch, - }, - step=completed_steps, - ) + { + "perplexity": perplexity, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() @@ -623,7 +623,7 @@ def group_texts(examples): repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) - + if args.checkpointing_steps == "epoch": accelerator.save_state(f"epoch_{epoch}") diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 1922e5f94c03..d97fb71f395c 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -560,13 +560,13 @@ def preprocess_function(examples): if args.with_tracking: accelerator.log( - { - "accuracy": eval_metric, - "train_loss": total_loss, - "epoch": epoch, - }, - step=completed_steps, - ) + { + "accuracy": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 939724bd60e3..19f8c301e6d2 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -210,6 +210,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -241,7 +258,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -670,14 +688,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. 
- model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -692,6 +702,23 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -707,11 +734,38 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -721,6 +775,10 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -847,6 +905,20 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): predict_metric = 
metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") + if args.with_tracking: + log = { + "squad_v2" if args.version_2_with_negative else "squad": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + } + if args.do_predict: + log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric + + accelerator.log(log, step=completed_steps) + + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 2129ed97678f..f37d2de59559 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -239,6 +239,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -270,7 +287,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -676,14 +694,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -698,6 +708,23 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. 
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -713,11 +740,38 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -727,6 +781,10 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -810,6 +868,20 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") + if args.with_tracking: + log = { + "squad_v2" if args.version_2_with_negative else "squad": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + } + if args.do_predict: + log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric + + accelerator.log(log, step=completed_steps) + + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) From 
66a5739f15ea97e7a81242519e83e20bbb2552de Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 6 Apr 2022 11:43:22 -0400 Subject: [PATCH 6/8] Summarization --- .../run_summarization_no_trainer.py | 83 +++++++++++++++++-- 1 file changed, 74 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index d9b8657d261b..fd2bb2cc8162 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -262,6 +262,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -296,7 +313,8 @@ def main(): "`--source_prefix 'summarize: ' `" ) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -494,14 +512,6 @@ def postprocess_text(preds, labels): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -516,6 +526,23 @@ def postprocess_text(preds, labels): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("summarization_no_trainer", args) + # Metric metric = load_metric("rouge") @@ -532,12 +559,38 @@ def postprocess_text(preds, labels): # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -547,6 +600,10 @@ def postprocess_text(preds, labels): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -596,6 +653,11 @@ def postprocess_text(preds, labels): logger.info(result) + if args.with_tracking: + result["train_loss"] = total_loss + result["epoch"] = epoch + accelerator.log(result, step=completed_steps) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -606,6 +668,9 @@ def postprocess_text(preds, labels): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) From 270bed1d1358262115349d6158ee40f9ad16d6a2 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 6 Apr 2022 11:52:00 -0400 Subject: [PATCH 7/8] Token and text classification --- .../run_glue_no_trainer.py | 89 +++++++++++++++++-- .../run_ner_no_trainer.py | 87 ++++++++++++++++-- 2 files changed, 158 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 4dac18206b47..5bd1d1fa1e5c 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -150,6 +150,24 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." 
) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -173,7 +191,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -376,14 +395,6 @@ def preprocess_function(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -398,6 +409,23 @@ def preprocess_function(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("glue_no_trainer", args) + # Get the metric function if args.task_name is not None: metric = load_metric("glue", args.task_name) @@ -417,12 +445,38 @@ def preprocess_function(examples): # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -432,6 +486,10 @@ def preprocess_function(examples): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -447,6 +505,16 @@ def preprocess_function(examples): eval_metric = metric.compute() logger.info(f"epoch {epoch}: {eval_metric}") + if args.with_tracking: + accelerator.log( + { + "accuracy" if args.task_name is not None else "glue": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -457,6 +525,9 @@ def preprocess_function(examples): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 90b01d6deb16..57d3ceee905d 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -204,6 +204,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." 
) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -227,7 +244,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -491,14 +509,6 @@ def tokenize_and_align_labels(examples): device = accelerator.device model.to(device) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -513,6 +523,23 @@ def tokenize_and_align_labels(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Metrics metric = load_metric("seqeval") @@ -569,12 +596,38 @@ def compute_metrics(): # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -584,6 +637,10 @@ def compute_metrics(): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -608,6 +665,15 @@ def compute_metrics(): # eval_metric = metric.compute() eval_metric = compute_metrics() accelerator.print(f"epoch {epoch}:", eval_metric) + if args.with_tracking: + accelerator.log( + { + "seqeval": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() @@ -619,6 +685,9 @@ def compute_metrics(): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) From 2d94583f4f238a1a6b2e2ecdb1794829a9cf2d33 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 6 Apr 2022 11:59:56 -0400 Subject: [PATCH 8/8] Translation --- .../translation/run_translation_no_trainer.py | 89 +++++++++++++++++-- 1 file changed, 80 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 34c1bcd0e72f..bf7e15ae4dd1 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -243,6 +243,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." 
) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -268,7 +285,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( @@ -472,14 +490,6 @@ def preprocess_function(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -494,6 +504,23 @@ def preprocess_function(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. 
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("translation_no_trainer", args) + metric = load_metric("sacrebleu") def postprocess_text(preds, labels): @@ -516,11 +543,38 @@ def postprocess_text(preds, labels): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -530,6 +584,10 @@ def postprocess_text(preds, labels): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -574,6 +632,16 @@ def postprocess_text(preds, labels): eval_metric = metric.compute() logger.info({"bleu": eval_metric["score"]}) + if args.with_tracking: + accelerator.log( + { + "blue": eval_metric["score"], + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -584,6 +652,9 @@ def postprocess_text(preds, labels): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model)
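Taken together, the eight patches give every `*_no_trainer.py` example the same save/resume/track plumbing. Below is a condensed sketch of the resume path they share; the helper and its `steps_per_epoch` argument are illustrative rather than identifiers from the patch, the checkpoint-folder discovery mirrors the diffs, and the skip offset is computed here with a plain modulo rather than the epoch-adjusted subtraction the scripts use:

```python
import os
from accelerate import Accelerator

def restore_training_state(accelerator: Accelerator, resume_from_checkpoint, num_train_epochs, steps_per_epoch):
    """Reload a saved Accelerate state and report how much training remains."""
    if resume_from_checkpoint:
        path = resume_from_checkpoint
        accelerator.print(f"Resumed from checkpoint: {path}")
    else:
        # Fall back to the newest "epoch_*" / "step_*" folder in the working directory.
        dirs = [d.name for d in os.scandir(os.getcwd()) if d.is_dir()]
        dirs.sort(key=os.path.getctime)
        path = dirs[-1]
    accelerator.load_state(path)

    resume_step = None
    if "epoch" in path:
        # Whole epochs already finished.
        num_train_epochs -= int(path.replace("epoch_", ""))
    else:
        completed = int(path.replace("step_", ""))
        num_train_epochs -= completed // steps_per_epoch
        # Batches of the partially finished epoch that the loop should skip
        # (the scripts derive this from the adjusted epoch count instead).
        resume_step = completed % steps_per_epoch
    return num_train_epochs, resume_step
```

The training loops then simply `continue` past the first `resume_step` batches of epoch 0 before resuming optimizer updates, so work already covered by the restored state is not repeated.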