Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/question-answering/run_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@


# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.18.0")
check_min_version("4.19.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

Expand Down
15 changes: 12 additions & 3 deletions examples/question-answering/utils_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def postprocess_qa_predictions(
"end_logit": end_logits[end_index],
}
)
if version_2_with_negative:
if version_2_with_negative and min_null_prediction is not None:
# Add the minimum null prediction
prelim_predictions.append(min_null_prediction)
null_score = min_null_prediction["score"]
Expand All @@ -165,7 +165,11 @@ def postprocess_qa_predictions(
predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]

# Add back the minimum null prediction if it was removed because of its low score.
if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions):
if (
version_2_with_negative
and min_null_prediction is not None
and not any(p["offsets"] == (0, 0) for p in predictions)
):
predictions.append(min_null_prediction)

# Use the offsets to gather the answer text in the original context.
Expand Down Expand Up @@ -347,9 +351,12 @@ def postprocess_qa_predictions_with_beam_search(
start_index >= len(offset_mapping)
or end_index >= len(offset_mapping)
or offset_mapping[start_index] is None
or len(offset_mapping[start_index]) < 2
or offset_mapping[end_index] is None
or len(offset_mapping[end_index]) < 2
):
continue

# Don't consider answers with a length negative or > max_answer_length.
if end_index < start_index or end_index - start_index + 1 > max_answer_length:
continue
Expand Down Expand Up @@ -378,7 +385,9 @@ def postprocess_qa_predictions_with_beam_search(
# In the very rare edge case where there is not a single non-null prediction, we create a fake prediction to
# avoid failure.
if len(predictions) == 0:
predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6})
# Without predictions min_null_score is going to be None and None will cause an exception later
min_null_score = -2e-6
predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score})

# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
# the LogSumExp trick).
Expand Down
2 changes: 1 addition & 1 deletion examples/text-classification/run_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@


# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.18.0")
check_min_version("4.19.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

Expand Down
122 changes: 78 additions & 44 deletions optimum/habana/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
PredictionOutput,
TrainOutput,
denumpify_detensorize,
enable_full_determinism,
find_executable_batch_size,
get_last_checkpoint,
has_length,
set_seed,
Expand Down Expand Up @@ -300,7 +302,8 @@ def train(
kwargs:
Additional keyword arguments used to hide deprecated arguments
"""
resume_from_checkpoint = None if not resume_from_checkpoint else resume_from_checkpoint
if resume_from_checkpoint is False:
resume_from_checkpoint = None

# memory metrics - must set up as early as possible
self._memory_tracker.start()
Expand All @@ -325,7 +328,7 @@ def train(
model_reloaded = False
if self.model_init is not None:
# Seed must be set before instantiating the model when using model_init.
set_seed(args.seed)
enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
self.model = self.call_model_init(trial)
model_reloaded = True
# Reinitializes optimizer and scheduler
Expand All @@ -350,35 +353,28 @@ def train(
raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")

if resume_from_checkpoint is not None:
if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)):
raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")

logger.info(f"Loading model from {resume_from_checkpoint}).")

if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)):
config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME))
checkpoint_version = config.transformers_version
if checkpoint_version is not None and checkpoint_version != __version__:
logger.warning(
f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
f"Transformers but your current version is {__version__}. This is not recommended and could "
"yield to errors or unwanted behaviors."
)

# We load the model state dict on the CPU to avoid an OOM error.
state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu")
# If the model is on the GPU, it still works!
self._load_state_dict_in_model(state_dict)

# release memory
del state_dict
self._load_from_checkpoint(resume_from_checkpoint)

# If model was re-initialized, put it on the right device and update self.model_wrapped
if model_reloaded:
if self.place_model_on_device:
self._move_model_to_device(self.model, args.device)
self.model_wrapped = self.model

inner_training_loop = find_executable_batch_size(
self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
)
return inner_training_loop(
args=args,
resume_from_checkpoint=resume_from_checkpoint,
trial=trial,
ignore_keys_for_eval=ignore_keys_for_eval,
)

def _inner_training_loop(
self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
):
self._train_batch_size = batch_size
# Data loader and number of training steps
train_dataloader = self.get_train_dataloader()

Expand Down Expand Up @@ -546,6 +542,9 @@ def train(
)
self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)

if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0:
self._load_rng_state(resume_from_checkpoint)

step = -1
for step, inputs in enumerate(epoch_iterator):
if args.throughput_warmup_steps > 0 and args.throughput_warmup_steps == epoch * steps_in_epoch + step:
Expand Down Expand Up @@ -665,21 +664,7 @@ def train(
if args.local_rank != -1:
torch.distributed.barrier()

logger.info(
f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})."
)

best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
if os.path.exists(best_model_path):
# We load the model state dict on the CPU to avoid an OOM error.
state_dict = torch.load(best_model_path, map_location="cpu")
# If the model is on the GPU, it still works!
self._load_state_dict_in_model(state_dict)
else:
logger.warning(
f"Could not locate the best model at {best_model_path}, if you are running a distributed training "
"on multiple nodes, you should activate `--save_on_each_node`."
)
self._load_best_model()

# add remaining tr_loss
self._total_loss_scalar += tr_loss.item()
Expand Down Expand Up @@ -840,7 +825,7 @@ def evaluation_loop(

model = self._wrap_model(self.model, training=False)

batch_size = self.args.per_device_eval_batch_size
batch_size = self.args.eval_batch_size

logger.info(f"***** Running {description} *****")
if has_length(dataloader):
Expand All @@ -863,10 +848,13 @@ def evaluation_loop(
losses_host = None
preds_host = None
labels_host = None
inputs_host = None

# losses/preds/labels/inputs on CPU (final containers)
all_losses = None
all_preds = None
all_labels = None
all_inputs = None
# Will be useful when we have an iterable dataset so don't know its length.

observed_num_examples = 0
Expand All @@ -882,6 +870,7 @@ def evaluation_loop(

# Prediction step
loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None

# Update containers on host
if loss is not None:
Expand All @@ -891,6 +880,14 @@ def evaluation_loop(
labels = self._pad_across_processes(labels)
labels = self._nested_gather(labels)
labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
if inputs_decode is not None:
inputs_decode = self._pad_across_processes(inputs_decode)
inputs_decode = self._nested_gather(inputs_decode)
inputs_host = (
inputs_decode
if inputs_host is None
else nested_concat(inputs_host, inputs_decode, padding_index=-100)
)
if logits is not None:
logits = self._pad_across_processes(logits)
logits = self._nested_gather(logits)
Expand All @@ -908,14 +905,21 @@ def evaluation_loop(
preds_host = to_device_dtype(preds_host, target_dtype=torch.float32)
logits = nested_numpify(preds_host)
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
if inputs_host is not None:
inputs_decode = nested_numpify(inputs_host)
all_inputs = (
inputs_decode
if all_inputs is None
else nested_concat(all_inputs, inputs_decode, padding_index=-100)
)
if labels_host is not None:
labels = nested_numpify(labels_host)
all_labels = (
labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
)

# Set back to None to begin a new accumulation
losses_host, preds_host, labels_host = None, None, None
losses_host, preds_host, inputs_host, labels_host = None, None, None, None

# nested concat does accumulation on tensors of variable length.
# Added mark step here to avoid graph recompile
Expand All @@ -935,6 +939,11 @@ def evaluation_loop(
preds_host = to_device_dtype(preds_host, target_dtype=torch.float32)
logits = nested_numpify(preds_host)
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
if inputs_host is not None:
inputs_decode = nested_numpify(inputs_host)
all_inputs = (
inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100)
)
if labels_host is not None:
labels = nested_numpify(labels_host)
all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
Expand All @@ -960,10 +969,17 @@ def evaluation_loop(
all_preds = nested_truncate(all_preds, num_samples)
if all_labels is not None:
all_labels = nested_truncate(all_labels, num_samples)
if all_inputs is not None:
all_inputs = nested_truncate(all_inputs, num_samples)

# Metrics!
if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
if args.include_inputs_for_metrics:
metrics = self.compute_metrics(
EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs)
)
else:
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
else:
metrics = {}

Expand Down Expand Up @@ -1085,6 +1101,7 @@ def prediction_loop(
losses_host: torch.Tensor = None
preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
labels_host: Union[torch.Tensor, List[torch.Tensor]] = None
inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None

world_size = max(1, args.world_size)

Expand All @@ -1097,6 +1114,7 @@ def prediction_loop(
make_multiple_of = dataloader.sampler.batch_size
preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)

model.eval()

Expand All @@ -1107,13 +1125,21 @@ def prediction_loop(

for step, inputs in enumerate(dataloader):
loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None

if loss is not None:
losses = loss.repeat(batch_size)
losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
if logits is not None:
preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
if labels is not None:
labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
if inputs_decode is not None:
inputs_host = (
inputs_decode
if inputs_host is None
else nested_concat(inputs_host, inputs_decode, padding_index=-100)
)
self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)

# Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
Expand All @@ -1122,9 +1148,10 @@ def prediction_loop(
if not prediction_loss_only:
preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))

# Set back to None to begin a new accumulation
losses_host, preds_host, labels_host = None, None, None
losses_host, preds_host, labels_host, inputs_host = None, None, None, None

# nested concat does accumulation on tensors of variable length.
# Added mark step here to avoid graph recompile
Expand All @@ -1140,13 +1167,20 @@ def prediction_loop(
if not prediction_loss_only:
preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))

eval_loss = eval_losses_gatherer.finalize()
preds = preds_gatherer.finalize() if not prediction_loss_only else None
label_ids = labels_gatherer.finalize() if not prediction_loss_only else None
inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None

if self.compute_metrics is not None and preds is not None and label_ids is not None:
metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
if args.include_inputs_for_metrics:
metrics = self.compute_metrics(
EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids)
)
else:
metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
else:
metrics = {}

Expand Down
2 changes: 1 addition & 1 deletion optimum/habana/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def __str__(self):
@torch_required
def _setup_devices(self) -> "torch.device":
logger.info("PyTorch: setting up devices")
if torch.distributed.is_initialized() and self.local_rank == -1:
if torch.distributed.is_available() and torch.distributed.is_initialized() and self.local_rank == -1:
logger.warning("torch.distributed process group is initialized, but local_rank == -1. ")
if self.no_cuda:
device = torch.device("cpu")
Expand Down
2 changes: 1 addition & 1 deletion optimum/habana/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.0.1"
__version__ = "1.1.0dev0"
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@


INSTALL_REQUIRES = [
"transformers == 4.18.0",
"transformers == 4.19.1",
"optimum",
"datasets",
"tokenizers",
"torch",
"sentencepiece",
"scipy",
"pillow",
"huggingface_hub==0.4.0",
]

TESTS_REQUIRE = [
Expand Down Expand Up @@ -51,7 +52,7 @@
"Programming Language :: Python :: 3.9",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords="transformers, quantization, fine-tuning, gaudi, hpu",
keywords="transformers, mixed-precision training, fine-tuning, gaudi, hpu",
url="https://huggingface.co/hardware/habana",
author="HuggingFace Inc. Special Ops Team",
author_email="hardware@huggingface.co",
Expand Down
Loading