diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index b5b7f66c05..37bfdf4760 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0") +check_min_version("4.19.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/question-answering/utils_qa.py b/examples/question-answering/utils_qa.py index 13baf925f7..8651293efb 100644 --- a/examples/question-answering/utils_qa.py +++ b/examples/question-answering/utils_qa.py @@ -156,7 +156,7 @@ def postprocess_qa_predictions( "end_logit": end_logits[end_index], } ) - if version_2_with_negative: + if version_2_with_negative and min_null_prediction is not None: # Add the minimum null prediction prelim_predictions.append(min_null_prediction) null_score = min_null_prediction["score"] @@ -165,7 +165,11 @@ def postprocess_qa_predictions( predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] # Add back the minimum null prediction if it was removed because of its low score. - if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + if ( + version_2_with_negative + and min_null_prediction is not None + and not any(p["offsets"] == (0, 0) for p in predictions) + ): predictions.append(min_null_prediction) # Use the offsets to gather the answer text in the original context. @@ -347,9 +351,12 @@ def postprocess_qa_predictions_with_beam_search( start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 ): continue + # Don't consider answers with a length negative or > max_answer_length. if end_index < start_index or end_index - start_index + 1 > max_answer_length: continue @@ -378,7 +385,9 @@ def postprocess_qa_predictions_with_beam_search( # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid # failure. if len(predictions) == 0: - predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) + # Without predictions min_null_score is going to be None and None will cause an exception later + min_null_score = -2e-6 + predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score}) # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using # the LogSumExp trick). diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index daf261cbbe..ae7c71bd83 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0") +check_min_version("4.19.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/optimum/habana/trainer.py b/optimum/habana/trainer.py index ec1a5cd7c6..3268b043ea 100644 --- a/optimum/habana/trainer.py +++ b/optimum/habana/trainer.py @@ -63,6 +63,8 @@ PredictionOutput, TrainOutput, denumpify_detensorize, + enable_full_determinism, + find_executable_batch_size, get_last_checkpoint, has_length, set_seed, @@ -300,7 +302,8 @@ def train( kwargs: Additional keyword arguments used to hide deprecated arguments """ - resume_from_checkpoint = None if not resume_from_checkpoint else resume_from_checkpoint + if resume_from_checkpoint is False: + resume_from_checkpoint = None # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -325,7 +328,7 @@ def train( model_reloaded = False if self.model_init is not None: # Seed must be set before instantiating the model when using model_init. - set_seed(args.seed) + enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) self.model = self.call_model_init(trial) model_reloaded = True # Reinitializes optimizer and scheduler @@ -350,28 +353,7 @@ def train( raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") if resume_from_checkpoint is not None: - if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): - raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") - - logger.info(f"Loading model from {resume_from_checkpoint}).") - - if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): - config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) - checkpoint_version = config.transformers_version - if checkpoint_version is not None and checkpoint_version != __version__: - logger.warning( - f"You are resuming training from a checkpoint trained with {checkpoint_version} of " - f"Transformers but your current version is {__version__}. This is not recommended and could " - "yield to errors or unwanted behaviors." - ) - - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) - - # release memory - del state_dict + self._load_from_checkpoint(resume_from_checkpoint) # If model was re-initialized, put it on the right device and update self.model_wrapped if model_reloaded: @@ -379,6 +361,20 @@ def train( self._move_model_to_device(self.model, args.device) self.model_wrapped = self.model + inner_training_loop = find_executable_batch_size( + self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size + ) + return inner_training_loop( + args=args, + resume_from_checkpoint=resume_from_checkpoint, + trial=trial, + ignore_keys_for_eval=ignore_keys_for_eval, + ) + + def _inner_training_loop( + self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None + ): + self._train_batch_size = batch_size # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -546,6 +542,9 @@ def train( ) self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + step = -1 for step, inputs in enumerate(epoch_iterator): if args.throughput_warmup_steps > 0 and args.throughput_warmup_steps == epoch * steps_in_epoch + step: @@ -665,21 +664,7 @@ def train( if args.local_rank != -1: torch.distributed.barrier() - logger.info( - f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." - ) - - best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME) - if os.path.exists(best_model_path): - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(best_model_path, map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) - else: - logger.warning( - f"Could not locate the best model at {best_model_path}, if you are running a distributed training " - "on multiple nodes, you should activate `--save_on_each_node`." - ) + self._load_best_model() # add remaining tr_loss self._total_loss_scalar += tr_loss.item() @@ -840,7 +825,7 @@ def evaluation_loop( model = self._wrap_model(self.model, training=False) - batch_size = self.args.per_device_eval_batch_size + batch_size = self.args.eval_batch_size logger.info(f"***** Running {description} *****") if has_length(dataloader): @@ -863,10 +848,13 @@ def evaluation_loop( losses_host = None preds_host = None labels_host = None + inputs_host = None + # losses/preds/labels on CPU (final containers) all_losses = None all_preds = None all_labels = None + all_inputs = None # Will be useful when we have an iterable dataset so don't know its length. observed_num_examples = 0 @@ -882,6 +870,7 @@ def evaluation_loop( # Prediction step loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None # Update containers on host if loss is not None: @@ -891,6 +880,14 @@ def evaluation_loop( labels = self._pad_across_processes(labels) labels = self._nested_gather(labels) labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if inputs_decode is not None: + inputs_decode = self._pad_across_processes(inputs_decode) + inputs_decode = self._nested_gather(inputs_decode) + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) if logits is not None: logits = self._pad_across_processes(logits) logits = self._nested_gather(logits) @@ -908,6 +905,13 @@ def evaluation_loop( preds_host = to_device_dtype(preds_host, target_dtype=torch.float32) logits = nested_numpify(preds_host) all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode + if all_inputs is None + else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) if labels_host is not None: labels = nested_numpify(labels_host) all_labels = ( @@ -915,7 +919,7 @@ def evaluation_loop( ) # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host = None, None, None + losses_host, preds_host, inputs_host, labels_host = None, None, None, None # nested concat does accumulation on tensors of variable length. # Added mark step here to avoid graph recompile @@ -935,6 +939,11 @@ def evaluation_loop( preds_host = to_device_dtype(preds_host, target_dtype=torch.float32) logits = nested_numpify(preds_host) all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) if labels_host is not None: labels = nested_numpify(labels_host) all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) @@ -960,10 +969,17 @@ def evaluation_loop( all_preds = nested_truncate(all_preds, num_samples) if all_labels is not None: all_labels = nested_truncate(all_labels, num_samples) + if all_inputs is not None: + all_inputs = nested_truncate(all_inputs, num_samples) # Metrics! if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) + ) + else: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) else: metrics = {} @@ -1085,6 +1101,7 @@ def prediction_loop( losses_host: torch.Tensor = None preds_host: Union[torch.Tensor, List[torch.Tensor]] = None labels_host: Union[torch.Tensor, List[torch.Tensor]] = None + inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None world_size = max(1, args.world_size) @@ -1097,6 +1114,7 @@ def prediction_loop( make_multiple_of = dataloader.sampler.batch_size preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) model.eval() @@ -1107,6 +1125,8 @@ def prediction_loop( for step, inputs in enumerate(dataloader): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None + if loss is not None: losses = loss.repeat(batch_size) losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) @@ -1114,6 +1134,12 @@ def prediction_loop( preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) if labels is not None: labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if inputs_decode is not None: + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. @@ -1122,9 +1148,10 @@ def prediction_loop( if not prediction_loss_only: preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host = None, None, None + losses_host, preds_host, labels_host, inputs_host = None, None, None, None # nested concat does accumulation on tensors of variable length. # Added mark step here to avoid graph recompile @@ -1140,13 +1167,20 @@ def prediction_loop( if not prediction_loss_only: preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) eval_loss = eval_losses_gatherer.finalize() preds = preds_gatherer.finalize() if not prediction_loss_only else None label_ids = labels_gatherer.finalize() if not prediction_loss_only else None + inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None if self.compute_metrics is not None and preds is not None and label_ids is not None: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) + ) + else: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} diff --git a/optimum/habana/training_args.py b/optimum/habana/training_args.py index e3b20a25a8..8889e098e0 100644 --- a/optimum/habana/training_args.py +++ b/optimum/habana/training_args.py @@ -135,7 +135,7 @@ def __str__(self): @torch_required def _setup_devices(self) -> "torch.device": logger.info("PyTorch: setting up devices") - if torch.distributed.is_initialized() and self.local_rank == -1: + if torch.distributed.is_available() and torch.distributed.is_initialized() and self.local_rank == -1: logger.warning("torch.distributed process group is initialized, but local_rank == -1. ") if self.no_cuda: device = torch.device("cpu") diff --git a/optimum/habana/version.py b/optimum/habana/version.py index be891fbc7d..ddbb952843 100644 --- a/optimum/habana/version.py +++ b/optimum/habana/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.0.1" +__version__ = "1.1.0dev0" diff --git a/setup.py b/setup.py index 33412e10d6..10c08b6b1a 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ INSTALL_REQUIRES = [ - "transformers == 4.18.0", + "transformers == 4.19.1", "optimum", "datasets", "tokenizers", @@ -21,6 +21,7 @@ "sentencepiece", "scipy", "pillow", + "huggingface_hub==0.4.0", ] TESTS_REQUIRE = [ @@ -51,7 +52,7 @@ "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - keywords="transformers, quantization, fine-tuning, gaudi, hpu", + keywords="transformers, mixed-precision training, fine-tuning, gaudi, hpu", url="https://huggingface.co/hardware/habana", author="HuggingFace Inc. Special Ops Team", author_email="hardware@huggingface.co", diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 635bccdaaa..b93118da30 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -122,11 +122,12 @@ def __call__(self, eval_pred): class RegressionModelConfig(PretrainedConfig): - def __init__(self, a=0, b=0, double_output=False, **kwargs): + def __init__(self, a=0, b=0, double_output=False, random_torch=True, **kwargs): super().__init__(**kwargs) self.a = a self.b = b self.double_output = double_output + self.random_torch = random_torch self.hidden_size = 1 @@ -224,14 +225,18 @@ def __init__(self, config): super().__init__(config) self.a = nn.Parameter(torch.tensor(config.a).float()) self.b = nn.Parameter(torch.tensor(config.b).float()) + self.random_torch = config.random_torch def forward(self, input_x, labels=None, **kwargs): y = input_x * self.a + self.b - torch_rand = torch.randn(1).squeeze() + if self.random_torch: + torch_rand = torch.randn(1).squeeze() np_rand = np.random.rand() rand_rand = random.random() - y += 0.05 * torch_rand + 0.05 * torch.tensor(np_rand + rand_rand) + if self.random_torch: + y += 0.05 * torch_rand + y += 0.05 * torch.tensor(np_rand + rand_rand) if labels is None: return (y,) @@ -1200,7 +1205,8 @@ def test_load_best_model_at_end(self): def test_training_iterable_dataset(self): config = RegressionModelConfig() model = RegressionPreTrainedModel(config) - train_dataset = SampleIterableDataset() + # Adding one column not used by the model should have no impact + train_dataset = SampleIterableDataset(label_names=["labels", "extra"]) args = RegressionGaudiTrainingArguments( output_dir="./examples", max_steps=4, use_habana=True, use_lazy_mode=True @@ -1241,7 +1247,8 @@ def test_training_finite_iterable_dataset(self): # def test_evaluation_iterable_dataset(self): # config = RegressionModelConfig(a=1.5, b=2.5) # model = RegressionPreTrainedModel(config) - # eval_dataset = SampleIterableDataset() + # # Adding one column not used by the model should have no impact + # eval_dataset = SampleIterableDataset(label_names=["labels", "extra"]) # args = RegressionGaudiTrainingArguments(output_dir="./examples", use_habana=True, use_lazy_mode=True) # gaudi_config = get_gaudi_config() @@ -1292,7 +1299,8 @@ def test_predict_iterable_dataset(self): self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) # With a number of elements not a round multiple of the batch size - test_dataset = SampleIterableDataset(length=66) + # Adding one column not used by the model should have no impact + test_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"]) preds = trainer.predict(test_dataset).predictions x = test_dataset.dataset.x self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))