Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/summarization/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,8 @@ python run_summarization.py \
--gaudi_config_name Habana/t5 \
--ignore_pad_token_for_loss False \
--pad_to_max_length \
--bf16
--bf16 \
--bf16_full_eval
```

You can run inference with BART on the CNN-DailyMail dataset on 1 Gaudi card with the following command:
Expand Down
21 changes: 21 additions & 0 deletions optimum/habana/transformers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,11 @@ def train(

self.is_in_train = True

# do_train is not a reliable argument, as it might not be set and .train() still called, so
# the following is a workaround:
if (args.fp16_full_eval or args.bf16_full_eval) and not args.do_train:
self._move_model_to_device(self.model, args.device)

if "model_path" in kwargs:
resume_from_checkpoint = kwargs.pop("model_path")
warnings.warn(
Expand Down Expand Up @@ -1510,6 +1515,14 @@ def evaluation_loop(
)
self.already_wrapped_for_hpu_graphs = True

# if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
# while ``train`` is running, cast it to the right dtype first and then put on device
if not self.is_in_train:
if args.fp16_full_eval:
model = model.to(dtype=torch.float16, device=args.device)
elif args.bf16_full_eval:
model = model.to(dtype=torch.bfloat16, device=args.device)

Comment thread
bhargaveede marked this conversation as resolved.
batch_size = self.args.eval_batch_size

logger.info(f"***** Running {description} *****")
Expand Down Expand Up @@ -1903,6 +1916,14 @@ def prediction_loop(
)
self.already_wrapped_for_hpu_graphs = True

# if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
# while ``train`` is running, cast it to the right dtype first and then put on device
if not self.is_in_train:
if args.fp16_full_eval:
model = model.to(dtype=torch.float16, device=args.device)
elif args.bf16_full_eval:
model = model.to(dtype=torch.bfloat16, device=args.device)

Comment thread
bhargaveede marked this conversation as resolved.
batch_size = dataloader.batch_size
num_examples = self.num_examples(dataloader)
logger.info(f"***** Running {description} *****")
Expand Down
3 changes: 0 additions & 3 deletions optimum/habana/transformers/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@

# List of arguments that are not supported by optimum-habana
UNSUPPORTED_ARGUMENTS = [
"bf16_full_eval",
"fp16",
"fp16_backend",
"fp16_full_eval",
Expand Down Expand Up @@ -314,8 +313,6 @@ def __post_init__(self):
raise ValueError("must be using hpu graphs to set max_hpu_graphs.")

# Raise errors for arguments that are not supported by optimum-habana
if self.bf16_full_eval:
raise ValueError("--bf16_full_eval is not supported by optimum-habana.")
if self.fp16 or self.fp16_full_eval:
raise ValueError(
"--fp16, --fp16_backend, --fp16_full_eval and --fp16_opt_level are not"
Expand Down
6 changes: 4 additions & 2 deletions tests/test_encoder_decoder_text_summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
MODELS_TO_TEST = {
"bf16": [
("facebook/bart-large-cnn", "Habana/bart", 4.691, 26.0688, 2, 1),
("t5-3b", "Habana/t5", 2.28, 21.56, 2, 1),
("t5-3b", "Habana/t5", 2.88, 21.56, 2, 1),
],
}
else:
# Gaudi1 CI baselines
MODELS_TO_TEST = {
"bf16": [
("facebook/bart-large-cnn", "Habana/bart", 2.588, 26.0688, 2, 1),
("t5-3b", "Habana/t5", 0.585, 21.72, 2, 1),
("t5-3b", "Habana/t5", 0.98, 21.56, 2, 1),
Comment on lines 17 to +26
Copy link
Copy Markdown
Collaborator

@regisss regisss Jan 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Gaudi1, I get a RougeLsum of 21.3831 and a throughput of 1.005. It doesn't matter much since the test passes (no need to update the numbers).
For Gaudi2, however, runs do not seem to be reproducible: I get a different RougeLsum from one run to another. Is that something you also observed?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't get different RougeLsum values. When I added the perf numbers, I ran the test twice to check and got the same RougeLsum both times. Let me check again.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, did you run it with Synapse 1.13?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could see the variation. However, I'm seeing variation on v1.9-release too for the test "test_run_summarization_t5-small_multi_card". Can you confirm whether it's the same on your end?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I cannot run multi-card tests on my Gaudi2 instance at the moment but if you observed the same behavior for "test_run_summarization_t5-small_multi_card" it means that this "issue" was already there before.
Anyway, tests still pass so I'm going to merge it and I'll investigate that later.

],
}

Expand Down Expand Up @@ -76,6 +76,8 @@ def _test_text_summarization(

if not deepspeed:
command.append("--bf16")
if model_name == "t5-3b":
command.append("--bf16_full_eval")

with TemporaryDirectory() as tmp_dir:
command.append(f"--output_dir {tmp_dir}")
Expand Down