intel · XuehaoSun · Oct 24, 2025 · Oct 13, 2025 · Oct 15, 2025 · Oct 15, 2025
diff --git a/examples/pytorch/diffusion_model/diffusers/flux/README.md b/examples/pytorch/diffusion_model/diffusers/flux/README.md
@@ -0,0 +1,34 @@
+# Step-by-Step
+
+This example quantizes and validates the accuracy of Flux.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+pip install -r requirements.txt
+# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.6 release
+pip install neural-compressor-pt==3.6
+# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc` for the latest updates before auto-round v0.8.0 release
+pip install auto-round==0.8.0
+```
+
+## 2. Prepare Model
+
+```shell
+hf download black-forest-labs/FLUX.1-dev --local-dir FLUX.1-dev
+```
+
+## 3. Prepare Dataset
+```shell
+wget https://github.com/mlcommons/inference/raw/refs/heads/master/text_to_image/coco2014/captions/captions_source.tsv
+```
+
+# Run
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=flux_fp8 --input_model=FLUX.1-dev
+```
+- topology: supported values are `flux_fp8` and `flux_mxfp8`.
+- CUDA_VISIBLE_DEVICES: the evaluation file is split into one subset per visible GPU, and the subsets are evaluated in parallel to speed up the evaluation.
diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import argparse
+
+import pandas as pd
+import tabulate
+import torch
+
+from diffusers import AutoPipelineForText2Image
+from neural_compressor.torch.quantization import (
+    AutoRoundConfig,
+    convert,
+    prepare,
+)
+import multiprocessing as mp
+
+from auto_round.compressors.diffusion.eval import metric_map
+from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader
+
+
+def inference_worker(device, eval_file, pipe, image_save_dir):
+    """Generate and save one PNG per prompt of ``eval_file``.
+
+    Prompts whose ``<image_id>.png`` already exists in ``image_save_dir`` are
+    not generated again, but they are still included in the returned lists.
+
+    Args:
+        device: ``"cpu"``, or the CUDA device index this worker should select.
+        eval_file: Caption file handed to ``get_diffusion_dataloader``.
+        pipe: Pipeline called as ``pipe(prompt=[...], **gen_kwargs)``; its
+            result must expose an ``images`` sequence of objects with ``save``.
+        image_save_dir: Directory the ``<image_id>.png`` files are written to.
+
+    Returns:
+        ``(prompt_list, image_list)``: all prompts yielded by the dataloader and
+        the image path for every image id, in dataloader order.
+    """
+    if device != "cpu":
+        # Select the GPU by index only. CUDA_VISIBLE_DEVICES is intentionally
+        # not rewritten here: changing it inside a running worker either has
+        # no effect (CUDA already initialised) or leaves a single visible GPU,
+        # for which any non-zero index passed to set_device would be invalid.
+        torch.cuda.set_device(device)
+
+    gen_kwargs = {
+        "guidance_scale": 7.5,
+        "num_inference_steps": 50,
+        "generator": None,
+    }
+
+    dataloader, _, _ = get_diffusion_dataloader(eval_file, nsamples=-1, bs=1)
+    prompt_list = []
+    image_list = []
+    for image_ids, prompts in dataloader:
+        prompt_list.extend(prompts)
+
+        # (image_path, prompt) pairs of this batch that still need generating.
+        pending = []
+        for idx, image_id in enumerate(image_ids):
+            image_path = os.path.join(image_save_dir, str(image_id.item()) + ".png")
+            image_list.append(image_path)
+            if not os.path.exists(image_path):
+                pending.append((image_path, prompts[idx]))
+
+        if not pending:
+            continue
+
+        output = pipe(prompt=[prompt for _, prompt in pending], **gen_kwargs)
+        for idx, (image_path, _) in enumerate(pending):
+            output.images[idx].save(image_path)
+
+    return prompt_list, image_list
+
+class BasicArgumentParser(argparse.ArgumentParser):
+    """Argument parser pre-populated with the options shared by this example.
+
+    Registers the model location, quantization scheme, the ``--quantize`` and
+    ``--inference`` switches, the calibration dataset, the output directory,
+    the evaluation caption file and the directory for generated images.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # The three spellings are aliases; the value is stored as ``args.model``.
+        self.add_argument("--model", "--model_name", "--model_name_or_path",
+                          help="model name or path")
+
+        self.add_argument('--scheme', default="MXFP4", type=str,
+                          help="quantization scheme.")
+
+        self.add_argument("--quantize", action="store_true")
+
+        self.add_argument("--inference", action="store_true")
+
+        self.add_argument("--dataset", type=str, default="coco2014",
+                          help="the dataset for quantization training.")
+
+        self.add_argument("--output_dir", default="./tmp_autoround", type=str,
+                          help="the directory to save quantized model")
+
+        self.add_argument("--eval_dataset", default="captions_source.tsv", type=str,
+                          help="eval datasets")
+
+        # Help text previously duplicated the --output_dir description.
+        self.add_argument("--output_image_path", default="./tmp_imgs", type=str,
+                          help="the directory to save generated images")
+
+
+def setup_parser():
+    """Create the example's parser, add ``--iters``/``--iter`` and parse the command line.
+
+    Returns:
+        The namespace produced by ``parse_args()``.
+    """
+    cli = BasicArgumentParser()
+    cli.add_argument(
+        "--iters",
+        "--iter",
+        default=1000,
+        type=int,
+        help="tuning iters",
+    )
+    return cli.parse_args()
+
+
+def tune(args):
+    """Quantize the transformer of a text-to-image pipeline with AutoRound.
+
+    Args:
+        args: Parsed command-line namespace; ``model``, ``scheme``, ``iters``
+            and ``dataset`` are read.
+
+    Returns:
+        ``(model, pipe)``: the object returned by ``convert`` and the loaded
+        pipeline.
+
+    Raises:
+        ValueError: If no model name or path was given.
+    """
+    # --model is optional in the parser, so fail with a clear message instead
+    # of a TypeError/IndexError from indexing None or an empty string.
+    if not args.model:
+        raise ValueError("--model is required: pass a model name or a local path.")
+    model_name = args.model
+    if model_name.endswith("/"):
+        model_name = model_name[:-1]
+    print(f"start to quantize {model_name}")
+
+    layer_config = {}
+    pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+    model = pipe.transformer
+    kwargs = {}
+    if args.scheme == "FP8":
+        # FP8 is expressed as an explicit per-layer config for every module
+        # whose class is named "Linear".
+        for n, m in model.named_modules():
+            if m.__class__.__name__ == "Linear":
+                layer_config[n] = {"bits": 8, "act_bits": 8, "data_type": "fp", "act_data_type": "fp", "group_size": 0, "act_group_size": 0}
+    elif args.scheme == "MXFP8":
+        kwargs["scheme"] = "MXFP8"
+    # Any other scheme value adds neither a layer_config entry nor a scheme
+    # argument, so AutoRoundConfig falls back to its own defaults.
+
+    qconfig = AutoRoundConfig(
+        iters=args.iters,
+        dataset=args.dataset,
+        layer_config=layer_config,
+        num_inference_steps=3,
+        export_format="fake",
+        nsamples=128,
+        batch_size=1,
+        **kwargs
+    )
+    model = prepare(model, qconfig)
+    # The whole pipeline is forwarded so convert() can use it for calibration.
+    model = convert(model, qconfig, pipeline=pipe)
+    return model, pipe
+
+if __name__ == '__main__':
+    # Script entry point: quantize the model, then (with --inference) generate
+    # an image for every evaluation prompt and print the metric table.
+    mp.set_start_method('spawn', force=True)
+    args = setup_parser()
+    model, pipe = tune(args)
+    # Use the parsed flag instead of scanning sys.argv so that abbreviations
+    # accepted by argparse (e.g. --infer) are honoured as well.
+    if args.inference:
+        os.makedirs(args.output_image_path, exist_ok=True)
+
+        visible_gpus = torch.cuda.device_count()
+
+        if visible_gpus == 0:
+            prompt_list, image_list = inference_worker("cpu", args.eval_dataset, pipe, args.output_image_path)
+
+        else:
+            # Split the evaluation file into one TSV per visible GPU.
+            df = pd.read_csv(args.eval_dataset, sep='\t')
+            subset_sample_num = len(df) // visible_gpus
+            for i in range(visible_gpus):
+                start = i * subset_sample_num
+                # The last subset also takes the remainder rows; with plain
+                # floor division they would never be evaluated.
+                end = len(df) if i == visible_gpus - 1 else start + subset_sample_num
+                df_subset = df.iloc[start : end]
+                df_subset.to_csv(f"subset_{i}.tsv", sep='\t', index=False)
+
+            pipe.model = model
+
+            # NOTE(review): every task is handed the same `pipe` object after
+            # an in-loop `.to(f"cuda:{i}")`; whether each worker really gets a
+            # copy on its own GPU depends on when the pool serialises the
+            # arguments - confirm.
+            with mp.Pool(processes=visible_gpus) as pool:
+                results = [pool.apply_async(inference_worker, (i,  f"subset_{i}.tsv", pipe.to(f"cuda:{i}"), args.output_image_path)) for i in range(visible_gpus)]
+                outputs = [r.get() for r in results]
+
+            prompt_list = []
+            image_list = []
+            for output in outputs:
+                prompt_list.extend(output[0])
+                image_list.extend(output[1])
+
+            print("Evaluations for subset are done! Getting the final accuracy...")
+
+        # Each metric function returns a dict; merge them into one table.
+        result = {}
+        metrics = ["clip", "clip-iqa", "imagereward"]
+        for metric in metrics:
+            result.update(metric_map[metric](prompt_list, image_list, pipe.device))
+
+        print(tabulate.tabulate(result.items(), tablefmt="grid"))
diff --git a/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt b/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt
@@ -0,0 +1,5 @@
+diffusers
+clip
+image-reward
+torchmetrics
+transformers==4.55.0
diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -x
+
+# Entry point: parse the command-line options, then launch quantization.
+main() {
+  init_params "$@"
+  run_tuning
+}
+
+# Parse --name=value options into the globals topology, dataset_location,
+# input_model and tuned_checkpoint. Exits with status 1 on an unknown option.
+function init_params {
+  for var in "$@"
+  do
+    # ${var#*=} keeps everything after the first '=', so values that contain
+    # '=' or whitespace are preserved (the former `echo $var | cut` was not
+    # quoted and truncated such values).
+    case $var in
+      --topology=*)
+          topology="${var#*=}"
+      ;;
+      --dataset_location=*)
+          dataset_location="${var#*=}"
+      ;;
+      --input_model=*)
+          input_model="${var#*=}"
+      ;;
+      --output_model=*)
+          tuned_checkpoint="${var#*=}"
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# Run main.py with --quantize and --inference for the selected topology.
+# Reads the globals topology, input_model and tuned_checkpoint; exits with
+# status 1 when the model is missing or the topology is not supported.
+function run_tuning {
+    tuned_checkpoint=${tuned_checkpoint:="saved_results"}
+
+    if [ -z "${input_model}" ]; then
+        echo "Error: --input_model is required"
+        exit 1
+    fi
+
+    # Fail early on an unknown topology instead of silently running main.py
+    # with its default scheme and iteration count.
+    if [ "${topology}" = "flux_fp8" ]; then
+        extra_cmd=(--scheme FP8 --iters 0 --dataset captions_source.tsv)
+    elif [ "${topology}" = "flux_mxfp8" ]; then
+        extra_cmd=(--scheme MXFP8 --iters 10 --dataset captions_source.tsv)
+    else
+        echo "Error: unsupported topology: '${topology}' (expected flux_fp8 or flux_mxfp8)"
+        exit 1
+    fi
+
+    # Paths are quoted so that values containing spaces stay single arguments.
+    python3 main.py \
+        --model "${input_model}" \
+        --output_dir "${tuned_checkpoint}" \
+        --quantize \
+        --inference \
+        "${extra_cmd[@]}"
+}
+
+main "$@"
diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -100,6 +100,10 @@ def __init__(
         truncation: bool = False,
         # 0.7
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
+        # diffusion
+        guidance_scale: float = 7.5,
+        num_inference_steps: int = 50,
+        generator_seed: int = None,
         **kwargs,
     ):
         """Init a AutQRoundQuantizer object.
@@ -172,6 +176,10 @@ def __init__(
             template (Template): The template to specify process for different mllms.
             truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
             scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations.
+            guidance_scale (float): Control how much the image generation process follows the text prompt.
+                                    The more it is, the more closely it follows the prompt (default is 7.5).
+            num_inference_steps (int): The reference number of denoising steps (default is 50).
+            generator_seed (int): A seed that controls the initial noise for image generation (default is None).
 
         Returns:
             The quantized model.
@@ -227,6 +235,9 @@ def __init__(
         self.device_map = device_map
         self.quant_lm_head = quant_lm_head
         self.enable_w4afp8 = self._is_w4afp8()
+        self.guidance_scale = guidance_scale
+        self.num_inference_steps = num_inference_steps
+        self.generator_seed = generator_seed
 
     def _is_w4afp8(self) -> bool:
         return any([v.get("data_type", None) == "fp8_to_int_sym" for v in self.quant_config.values()])
@@ -252,13 +263,16 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         Returns:
             The quantized model.
         """
+        pipe = kwargs.pop("pipeline", None)
         tokenizer = getattr(model.orig_model, "tokenizer", None)
         if tokenizer is not None:
             delattr(model.orig_model, "tokenizer")
-        else:
+        elif pipe is None:
             tokenizer = "Placeholder"
             self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
         model = model.orig_model
+        if pipe is not None:
+            model = pipe
         rounder = AutoRound(
             model,
             layer_config=self.layer_config,
@@ -307,6 +321,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             truncation=self.truncation,
             enable_torch_compile=self.enable_torch_compile,
             quant_lm_head=self.quant_lm_head,
+            guidance_scale=self.guidance_scale,
+            num_inference_steps=self.num_inference_steps,
+            generator_seed=self.generator_seed,
         )
 
         if self.enable_w4afp8:

diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -608,6 +608,7 @@ def autoround_quantize_entry(
                 "act_data_type": act_data_type,
             }
             layer_config = quant_config.to_dict().get("layer_config", None)
+            dataset = quant_config.to_dict().get("dataset", "NeelNanda/pile-10k")
             output_dir = quant_config.to_dict().get("output_dir", "temp_auto_round")
             enable_full_range = quant_config.enable_full_range
             batch_size = quant_config.batch_size
@@ -642,6 +643,9 @@ def autoround_quantize_entry(
             scheme = quant_config.scheme
             device_map = quant_config.device_map
             quant_lm_head = quant_config.quant_lm_head
+            guidance_scale = quant_config.to_dict().get("guidance_scale", 7.5)
+            num_inference_steps = quant_config.to_dict().get("num_inference_steps", 50)
+            generator_seed = quant_config.to_dict().get("generator_seed", None)
 
     kwargs.pop("example_inputs")
     quantizer = get_quantizer(
@@ -665,6 +669,7 @@ def autoround_quantize_entry(
         batch_size=batch_size,
         amp=amp,
         lr_scheduler=lr_scheduler,
+        dataset=dataset,
         enable_quanted_input=enable_quanted_input,
         enable_minmax_tuning=enable_minmax_tuning,
         lr=lr,
@@ -694,6 +699,9 @@ def autoround_quantize_entry(
         scheme=scheme,
         device_map=device_map,
         quant_lm_head=quant_lm_head,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        generator_seed=generator_seed,
     )
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
     model.qconfig = configs_mapping

diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py
@@ -228,6 +228,7 @@ def convert(
     model: torch.nn.Module,
     quant_config: BaseConfig = None,
     inplace: bool = True,
+    **kwargs,
 ):
     """Convert the prepared model to a quantized model.
 
@@ -284,6 +285,7 @@ def convert(
                 configs_mapping,
                 example_inputs=example_inputs,
                 mode=Mode.CONVERT,
+                **kwargs,
             )
     setattr(q_model, "is_quantized", True)
     return q_model