huggingface · splotnikv · Sep 17, 2024 · Nov 27, 2024 · kumarans-ai · Sep 20, 2024
@@ -411,6 +411,7 @@ def main():
         kwargs_call["image"] = control_image
 
     # Instantiate a Stable Diffusion pipeline class
+    import habana_frameworks.torch.core as htcore
     if sdxl:
         # SDXL pipelines
         if controlnet:

@@ -479,6 +479,23 @@ def __call__(
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
+        import os
+        quant_mode="disable"
+        quant_config_path = os.getenv('QUANT_CONFIG')
+        if quant_config_path != None:
+            import habana_frameworks.torch.core as htcore
+            htcore.hpu_set_env()
+
+            from neural_compressor.torch.quantization import FP8Config, convert, prepare
+            config = FP8Config.from_json_file(quant_config_path)
+            if config.measure:
+                quant_mode="measure"
+                self.unet = prepare(self.unet, config)
+            elif config.quantize:
+                quant_mode="quantize"
+                self.unet = convert(self.unet, config) 
+            htcore.hpu_initialize(self.unet, mark_only_scales_as_const=True)
+
         callback = kwargs.pop("callback", None)
         callback_steps = kwargs.pop("callback_steps", None)
 
@@ -495,7 +512,13 @@ def __call__(
                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
             )
 
-        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast):
+        from contextlib import nullcontext
+        if quant_mode == "disable":
+            ctx_manager = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast)
+        else:
+            ctx_manager = nullcontext()
+
+        with ctx_manager:
             # 0. Default height and width to unet
             height = height or self.default_sample_size * self.vae_scale_factor
             width = width or self.default_sample_size * self.vae_scale_factor
@@ -816,6 +839,10 @@ def __call__(
 
             hb_profiler.stop()
 
+            if quant_mode == "measure":
+                from neural_compressor.torch.quantization import finalize_calibration
+                finalize_calibration(self.unet)
+
             speed_metrics_prefix = "generation"
             speed_measures = speed_metrics(
                 split=speed_metrics_prefix,