From 3bbcf345136b8047bb13fc52212ee58d989a2297 Mon Sep 17 00:00:00 2001 From: Sergey Plotnikov Date: Tue, 17 Sep 2024 14:53:36 -0700 Subject: [PATCH 1/2] Enable FP8 quantization in SDXL using INC --- .../text_to_image_generation.py | 9 ++++++ .../pipeline_stable_diffusion_xl.py | 28 ++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index e83d455237..a800a6db03 100755 --- a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -271,6 +271,12 @@ def main(): action="store_true", help="Enable deterministic generation using CPU Generator", ) + parser.add_argument( + "--quant_mode", + default="disable", + type=str, + help="Quantization mode 'measure', 'quantize' or 'disable'", + ) parser.add_argument( "--use_compel", action="store_true", @@ -410,7 +416,10 @@ def main(): control_image = Image.fromarray(image) kwargs_call["image"] = control_image + kwargs_call["quant_mode"] = args.quant_mode + # Instantiate a Stable Diffusion pipeline class + import habana_frameworks.torch.core as htcore if sdxl: # SDXL pipelines if controlnet: diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 0cd0cd28dd..f06d320337 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -479,6 +479,22 @@ def __call__( `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" + quant_mode=kwargs["quant_mode"] + if quant_mode == "measure" or quant_mode == "quantize": + import os + quant_config_path = os.getenv('QUANT_CONFIG') + + import habana_frameworks.torch.core as htcore + htcore.hpu_set_env() + + from neural_compressor.torch.quantization import FP8Config, convert, prepare + config = FP8Config.from_json_file(quant_config_path) + if config.measure: + self.unet = prepare(self.unet, config) + elif config.quantize: + self.unet = convert(self.unet, config) + htcore.hpu_initialize(self.unet, mark_only_scales_as_const=True) + callback = kwargs.pop("callback", None) callback_steps = kwargs.pop("callback_steps", None) @@ -495,7 +511,13 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): + from contextlib import nullcontext + if quant_mode == "disable": + ctx_manager = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast) + else: + ctx_manager = nullcontext() + + with ctx_manager: # 0. 
Default height and width to unet height = height or self.default_sample_size * self.vae_scale_factor width = width or self.default_sample_size * self.vae_scale_factor @@ -816,6 +838,10 @@ def __call__( hb_profiler.stop() + if quant_mode == "measure": + from neural_compressor.torch.quantization import finalize_calibration + finalize_calibration(self.unet) + speed_metrics_prefix = "generation" speed_measures = speed_metrics( split=speed_metrics_prefix, From ac4de48c161fe2d72195e7ba6999b0afed690239 Mon Sep 17 00:00:00 2001 From: Sergey Plotnikov Date: Wed, 27 Nov 2024 15:23:23 -0800 Subject: [PATCH 2/2] Replace command line parameter by env variable for SDXL quant control --- examples/stable-diffusion/text_to_image_generation.py | 8 -------- .../pipeline_stable_diffusion_xl.py | 11 ++++++----- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index a800a6db03..a8a483d5cf 100755 --- a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -271,12 +271,6 @@ def main(): action="store_true", help="Enable deterministic generation using CPU Generator", ) - parser.add_argument( - "--quant_mode", - default="disable", - type=str, - help="Quantization mode 'measure', 'quantize' or 'disable'", - ) parser.add_argument( "--use_compel", action="store_true", @@ -416,8 +410,6 @@ def main(): control_image = Image.fromarray(image) kwargs_call["image"] = control_image - kwargs_call["quant_mode"] = args.quant_mode - # Instantiate a Stable Diffusion pipeline class import habana_frameworks.torch.core as htcore if sdxl: diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index f06d320337..7903f6b206 100644 --- 
a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -479,19 +479,20 @@ def __call__( `tuple`. When returning a tuple, the first element is a list with the generated images. """ - quant_mode=kwargs["quant_mode"] - if quant_mode == "measure" or quant_mode == "quantize": - import os - quant_config_path = os.getenv('QUANT_CONFIG') - + import os + quant_mode="disable" + quant_config_path = os.getenv('QUANT_CONFIG') + if quant_config_path != None: import habana_frameworks.torch.core as htcore htcore.hpu_set_env() from neural_compressor.torch.quantization import FP8Config, convert, prepare config = FP8Config.from_json_file(quant_config_path) if config.measure: + quant_mode="measure" self.unet = prepare(self.unet, config) elif config.quantize: + quant_mode="quantize" self.unet = convert(self.unet, config) htcore.hpu_initialize(self.unet, mark_only_scales_as_const=True)