Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/stable-diffusion/text_to_image_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ def main():
kwargs_call["image"] = control_image

# Instantiate a Stable Diffusion pipeline class
import habana_frameworks.torch.core as htcore
if sdxl:
# SDXL pipelines
if controlnet:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,23 @@ def __call__(
`tuple`. When returning a tuple, the first element is a list with the generated images.
"""

import os
quant_mode="disable"
quant_config_path = os.getenv('QUANT_CONFIG')
if quant_config_path != None:
import habana_frameworks.torch.core as htcore
htcore.hpu_set_env()

from neural_compressor.torch.quantization import FP8Config, convert, prepare
config = FP8Config.from_json_file(quant_config_path)
if config.measure:
quant_mode="measure"
self.unet = prepare(self.unet, config)
elif config.quantize:
quant_mode="quantize"
self.unet = convert(self.unet, config)
Copy link
Copy Markdown

@kumarans-ai kumarans-ai Sep 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not all UNet steps should be converted to FP8: the recipe for good accuracy runs the last few steps in bf16. So you must have bf16 UNet support — refer to the Model Garden reference for the FP8 HQT-to-INC conversion patch.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I'll submit it as a separate PR. The original intention of this PR is to show how easy it is to quantize a model, so I prefer to keep the code changes to a bare minimum.

htcore.hpu_initialize(self.unet, mark_only_scales_as_const=True)

callback = kwargs.pop("callback", None)
callback_steps = kwargs.pop("callback_steps", None)

Expand All @@ -495,7 +512,13 @@ def __call__(
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
)

with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast):
from contextlib import nullcontext
if quant_mode == "disable":
ctx_manager = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast)
else:
ctx_manager = nullcontext()

with ctx_manager:
# 0. Default height and width to unet
height = height or self.default_sample_size * self.vae_scale_factor
width = width or self.default_sample_size * self.vae_scale_factor
Expand Down Expand Up @@ -816,6 +839,10 @@ def __call__(

hb_profiler.stop()

if quant_mode == "measure":
from neural_compressor.torch.quantization import finalize_calibration
finalize_calibration(self.unet)

speed_metrics_prefix = "generation"
speed_measures = speed_metrics(
split=speed_metrics_prefix,
Expand Down