From 3bbcf345136b8047bb13fc52212ee58d989a2297 Mon Sep 17 00:00:00 2001
From: Sergey Plotnikov <sergey.plotnikov@intel.com>
Date: Tue, 17 Sep 2024 14:53:36 -0700
Subject: [PATCH 1/2] Enable FP8 quantization in SDXL using INC

---
 .../text_to_image_generation.py               |  9 ++++++
 .../pipeline_stable_diffusion_xl.py           | 28 ++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py
index e83d455237..a800a6db03 100755
--- a/examples/stable-diffusion/text_to_image_generation.py
+++ b/examples/stable-diffusion/text_to_image_generation.py
@@ -271,6 +271,12 @@ def main():
         action="store_true",
         help="Enable deterministic generation using CPU Generator",
     )
+    parser.add_argument(
+        "--quant_mode",
+        default="disable",
+        type=str,
+        help="Quantization mode 'measure', 'quantize' or 'disable'",
+    )
     parser.add_argument(
         "--use_compel",
         action="store_true",
@@ -410,7 +416,10 @@ def main():
             control_image = Image.fromarray(image)
         kwargs_call["image"] = control_image
 
+    kwargs_call["quant_mode"] = args.quant_mode
+
     # Instantiate a Stable Diffusion pipeline class
+    import habana_frameworks.torch.core as htcore
     if sdxl:
         # SDXL pipelines
         if controlnet:
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index 0cd0cd28dd..f06d320337 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -479,6 +479,22 @@ def __call__(
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
+        quant_mode=kwargs["quant_mode"]
+        if quant_mode == "measure" or quant_mode == "quantize":
+            import os
+            quant_config_path = os.getenv('QUANT_CONFIG')
+
+            import habana_frameworks.torch.core as htcore
+            htcore.hpu_set_env()
+
+            from neural_compressor.torch.quantization import FP8Config, convert, prepare
+            config = FP8Config.from_json_file(quant_config_path)
+            if config.measure:
+                self.unet = prepare(self.unet, config)
+            elif config.quantize:
+                self.unet = convert(self.unet, config) 
+            htcore.hpu_initialize(self.unet, mark_only_scales_as_const=True)
+
         callback = kwargs.pop("callback", None)
         callback_steps = kwargs.pop("callback_steps", None)
 
@@ -495,7 +511,13 @@ def __call__(
                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
             )
 
-        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast):
+        from contextlib import nullcontext
+        if quant_mode == "disable":
+            ctx_manager = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast)
+        else:
+            ctx_manager = nullcontext()
+
+        with ctx_manager:
             # 0. Default height and width to unet
             height = height or self.default_sample_size * self.vae_scale_factor
             width = width or self.default_sample_size * self.vae_scale_factor
@@ -816,6 +838,10 @@ def __call__(
 
             hb_profiler.stop()
 
+            if quant_mode == "measure":
+                from neural_compressor.torch.quantization import finalize_calibration
+                finalize_calibration(self.unet)
+
             speed_metrics_prefix = "generation"
             speed_measures = speed_metrics(
                 split=speed_metrics_prefix,

From ac4de48c161fe2d72195e7ba6999b0afed690239 Mon Sep 17 00:00:00 2001
From: Sergey Plotnikov <sergey.plotnikov@intel.com>
Date: Wed, 27 Nov 2024 15:23:23 -0800
Subject: [PATCH 2/2] Replace command line parameter with env variable for SDXL
 quant control

---
 examples/stable-diffusion/text_to_image_generation.py |  8 --------
 .../pipeline_stable_diffusion_xl.py                   | 11 ++++++-----
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py
index a800a6db03..a8a483d5cf 100755
--- a/examples/stable-diffusion/text_to_image_generation.py
+++ b/examples/stable-diffusion/text_to_image_generation.py
@@ -271,12 +271,6 @@ def main():
         action="store_true",
         help="Enable deterministic generation using CPU Generator",
     )
-    parser.add_argument(
-        "--quant_mode",
-        default="disable",
-        type=str,
-        help="Quantization mode 'measure', 'quantize' or 'disable'",
-    )
     parser.add_argument(
         "--use_compel",
         action="store_true",
@@ -416,8 +410,6 @@ def main():
             control_image = Image.fromarray(image)
         kwargs_call["image"] = control_image
 
-    kwargs_call["quant_mode"] = args.quant_mode
-
     # Instantiate a Stable Diffusion pipeline class
     import habana_frameworks.torch.core as htcore
     if sdxl:
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index f06d320337..7903f6b206 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -479,19 +479,20 @@ def __call__(
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
-        quant_mode=kwargs["quant_mode"]
-        if quant_mode == "measure" or quant_mode == "quantize":
-            import os
-            quant_config_path = os.getenv('QUANT_CONFIG')
-
+        import os
+        quant_mode="disable"
+        quant_config_path = os.getenv('QUANT_CONFIG')
+        if quant_config_path != None:
             import habana_frameworks.torch.core as htcore
             htcore.hpu_set_env()
 
             from neural_compressor.torch.quantization import FP8Config, convert, prepare
             config = FP8Config.from_json_file(quant_config_path)
             if config.measure:
+                quant_mode="measure"
                 self.unet = prepare(self.unet, config)
             elif config.quantize:
+                quant_mode="quantize"
                 self.unet = convert(self.unet, config) 
             htcore.hpu_initialize(self.unet, mark_only_scales_as_const=True)