diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index e83d455237..a8a483d5cf 100755 --- a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -411,6 +411,7 @@ def main(): kwargs_call["image"] = control_image # Instantiate a Stable Diffusion pipeline class + import habana_frameworks.torch.core as htcore if sdxl: # SDXL pipelines if controlnet: diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 0cd0cd28dd..7903f6b206 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -479,6 +479,23 @@ def __call__( `tuple`. When returning a tuple, the first element is a list with the generated images. """ + import os + quant_mode="disable" + quant_config_path = os.getenv('QUANT_CONFIG') + if quant_config_path != None: + import habana_frameworks.torch.core as htcore + htcore.hpu_set_env() + + from neural_compressor.torch.quantization import FP8Config, convert, prepare + config = FP8Config.from_json_file(quant_config_path) + if config.measure: + quant_mode="measure" + self.unet = prepare(self.unet, config) + elif config.quantize: + quant_mode="quantize" + self.unet = convert(self.unet, config) + htcore.hpu_initialize(self.unet, mark_only_scales_as_const=True) + callback = kwargs.pop("callback", None) callback_steps = kwargs.pop("callback_steps", None) @@ -495,7 +512,13 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): + from contextlib import nullcontext + if quant_mode == "disable": + ctx_manager = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast) + else: + ctx_manager = nullcontext() + + with ctx_manager: # 0. Default height and width to unet height = height or self.default_sample_size * self.vae_scale_factor width = width or self.default_sample_size * self.vae_scale_factor @@ -816,6 +839,10 @@ def __call__( hb_profiler.stop() + if quant_mode == "measure": + from neural_compressor.torch.quantization import finalize_calibration + finalize_calibration(self.unet) + speed_metrics_prefix = "generation" speed_measures = speed_metrics( split=speed_metrics_prefix,