Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions examples/text-generation/run_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ def setup_parser(parser):
action="store_true",
help="Whether to use the key/value cache for decoding. It should speed up generation.",
)
parser.add_argument(
"--use_torch_compile",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"--use_torch_compile",
"--torch_compile",

to be aligned with Transformers and GaudiTrainingArguments

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will change it.

action="store_true",
help="Whether to use torch compiled model or not.",
)
parser.add_argument(
"--use_hpu_graphs",
action="store_true",
Expand Down Expand Up @@ -234,6 +239,9 @@ def setup_parser(parser):

args = parser.parse_args()

if args.use_torch_compile:
args.use_hpu_graphs = False

if not args.use_hpu_graphs:
args.limit_hpu_graphs = False

Expand Down Expand Up @@ -297,7 +305,8 @@ def generate(size=None, reduce_recompile=False):
outputs = model.generate(
**input_tokens,
generation_config=generation_config,
lazy_mode=True,
lazy_mode=True if not args.use_torch_compile else False,
torch_compile = args.use_torch_compile,
hpu_graphs=args.use_hpu_graphs,
profiling_steps=args.profiling_steps,
profiling_warmup_steps=args.profiling_warmup_steps,
Expand Down Expand Up @@ -477,7 +486,8 @@ def generate_dataset(batch):
outputs = model.generate(
**batch,
generation_config=generation_config,
lazy_mode=True,
lazy_mode=True if not args.use_torch_compile else False,
torch_compile = args.use_torch_compile,
hpu_graphs=args.use_hpu_graphs,
profiling_steps=args.profiling_steps,
profiling_warmup_steps=args.profiling_warmup_steps,
Expand Down
39 changes: 31 additions & 8 deletions optimum/habana/transformers/generation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ def generate(
negative_prompt_ids: Optional[torch.Tensor] = None,
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
lazy_mode: Optional[bool] = False,
torch_compile: Optional[bool] = False,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For normal training, eval, and predict, models are wrapped within the accelerator.prepare_model() call, so adding new code to generate() may not be aligned with that. @regisss, any idea how direct model.generate() calls are handled in Transformers for compile mode? I tried to search there but did not find anything.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the trainer, the link with Accelerate is made here:


And then in Accelerate it happens here:
if self.state.dynamo_plugin.backend != GaudiDynamoBackend.NO and not is_compiled_module(model):

It was introduced in #465.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Outside of the trainer, Transformers recommends simply using:

model = torch.compile(model)

https://huggingface.co/docs/transformers/v4.36.1/en/perf_torch_compile

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As suggested, I will create 'get_torch_compiled_model()' in text-generation/utils.py, and it will be called inside setup_model() in text-generation/utils.py.

hpu_graphs: Optional[bool] = False,
profiling_warmup_steps: Optional[int] = 0,
profiling_steps: Optional[int] = 0,
Expand Down Expand Up @@ -474,6 +475,8 @@ def generate(
Attention_mask for `negative_prompt_ids`.
lazy_mode (`bool`, *optional*, defaults to `False`):
Whether the run is executed in lazy mode or not (i.e. eager mode).
torch_compile (`bool`, *optional*, defaults to `False`):
Whether the run is executed with torch.compile model or not.
hpu_graphs (`bool`, *optional*, defaults to `False`):
Whether to use HPU graphs for inference.
profiling_warmup_steps (`int`, *optional*, defaults to 0):
Expand Down Expand Up @@ -513,6 +516,10 @@ def generate(
raise ValueError(
"`hpu_graphs` is True but `lazy_mode` is False. HPU graphs require `lazy_mode` to be set to True."
)
if torch_compile and (lazy_mode or hpu_graphs):
raise ValueError(
"`torch_compile` is True. This requires both `lazy_mode` and `hpu_graphs` to be set to False."
)

# priority: `generation_config` argument > `model.generation_config` (the default generation config)
if generation_config is None:
Expand Down Expand Up @@ -838,6 +845,7 @@ def generate(
synced_gpus=synced_gpus,
streamer=streamer,
lazy_mode=lazy_mode,
torch_compile=torch_compile,
ignore_eos=generation_config.ignore_eos,
profiling_warmup_steps=profiling_warmup_steps,
profiling_steps=profiling_steps,
Expand Down Expand Up @@ -1214,6 +1222,7 @@ def greedy_search(
synced_gpus: bool = False,
streamer: Optional["BaseStreamer"] = None,
lazy_mode: Optional[bool] = False,
torch_compile: Optional[bool] = False,
ignore_eos: Optional[bool] = False,
profiling_warmup_steps: Optional[int] = 0,
profiling_steps: Optional[int] = 0,
Expand Down Expand Up @@ -1265,6 +1274,8 @@ def greedy_search(
through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
lazy_mode (`bool`, *optional*, defaults to `False`):
Whether the run is executed in lazy mode or not (i.e. eager mode).
torch_compile (`bool`, *optional*, defaults to `False`):
Whether the run is executed with torch.compile model or not.
ignore_eos (`bool`, *optional*, defaults to `False`):
Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
profiling_warmup_steps (`int`, *optional*, defaults to 0):
Expand Down Expand Up @@ -1403,14 +1414,26 @@ def greedy_search(

hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs)

# forward pass to get next token
outputs = self(
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
**hpu_graphs_kwargs,
)
if torch_compile:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrapping the model only for greedy_search does not look right; it should probably be done in generate() so that it also works for other modes (such as beam_search).

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not even sure we should do it in generate at all. If using the trainer, it should already be taken care of (see discussion above). Otherwise, for example in the text-generation example, I think we should just have a get_torch_compiled_model in text-generation/utils.py. That seems to be the way recommended by Transformers.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@regisss, thanks for your comments. We will check whether we can go with adding get_torch_compiled_model in text-generation/utils.py.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. I will create 'get_torch_compiled_model' in text-generation/utils.py.

# apply torch.compile
compiled_model = torch.compile(self, backend="aot_hpu_inference_backend")
# forward pass to get next token
outputs = compiled_model(
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
**hpu_graphs_kwargs,
)
else:
# forward pass to get next token
outputs = self(
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
**hpu_graphs_kwargs,
)

if synced_gpus and this_peer_finished:
continue # don't waste resources running the code we don't need
Expand Down