
Commit 4151ffe

fix regression issue and command in mix-precision example (#2317)
Signed-off-by: He, Xin3 <[email protected]>
1 parent 8a4be4f · commit 4151ffe

5 files changed (+1607 −1600 lines)

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/README.md

Lines changed: 6 additions & 5 deletions
@@ -27,7 +27,7 @@ pip install -r requirements.txt
 ### Demo (`MXFP4`, `MXFP8`, `NVFP4`, `uNVFP4`)
 
 ```bash
-python quantize.py --model_name_or_path facebook/opt-125m --quantize --dtype MXFP4 --batch_size 8 --accuracy
+python quantize.py --model_name_or_path facebook/opt-125m --quantize --dtype MXFP4 --batch_size 8 --accuracy --enable_torch_compile
 ```
 
 ### Mix-precision Quantization (`MXFP4 + MXFP8`)
@@ -41,10 +41,11 @@ python quantize.py \
     --use_recipe \
     --recipe_file recipes/Meta-Llama-3.1-8B-Instruct_7bits.json \
     --accuracy \
-    --batch_size 32
+    --batch_size 32 \
+    --enable_torch_compile
 
 # Llama 3.3 70B
-deepspeed --include="localhost:4,5,6,7" --master_port=29500 python quantize.py \
+deepspeed --include="localhost:0,1,2,3" --master_port=29500 quantize.py \
     --model_name_or_path meta-llama/Llama-3.3-70B-Instruct/ \
     --quantize \
     --dtype MXFP4 \
@@ -111,13 +112,13 @@ Model with mixed precision is not supported in vLLM, but supported in transformers
 python quantize.py \
     --model_name_or_path meta-llama/Llama-3.1-8B-Instruct \
     --quantize \
-    --iters 0 \
     --dtype MXFP4 \
     --use_recipe \
     --recipe_file recipes/Meta-Llama-3.1-8B-Instruct_7bits.json \
     --save \
     --save_format auto_round \
-    --save_path Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR
+    --save_path Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR \
+    --enable_torch_compile
 
 # Command to inference with transformer:
 python run_hf_inf.py Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR

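For reference, the `run_hf_inf.py` step above loads the saved mixed-precision checkpoint back through plain transformers. Below is a minimal, hypothetical sketch of that loading path, assuming the auto-round package is installed so the `auto_round`-format checkpoint can be deserialized; the prompt and generation settings are illustrative and not taken from the example script.

```python
# Hypothetical sketch of the inference step; not the actual run_hf_inf.py.
# Assumes auto-round and transformers are installed so the auto_round-format
# checkpoint can be loaded directly.
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = sys.argv[1]  # e.g. Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```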
examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/quantize.py

Lines changed: 30 additions & 27 deletions
@@ -71,12 +71,14 @@ def initialize_model_and_tokenizer(model_name_or_path):
 parser.add_argument("--device_map", type=str, default=None, help="device map for model")
 parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
 parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
+parser.add_argument("--mem_per_param_scale", default=13, type=int, help="memory per param scale factor")
 parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
 parser.add_argument("--seqlen", default=2048, type=int, help="sequence length for autoround.")
 parser.add_argument("--nsamples", default=128, type=int, help="number of samples for autoround.")
 parser.add_argument("--save", action="store_true", help="whether to save the quantized model")
 parser.add_argument("--save_path", type=str, default="saved_results", help="path to save the quantized model")
 parser.add_argument("--save_format", type=str, default="auto_round", help="format to save the quantized model")
+parser.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch.compile")
 parser.add_argument("--quant_lm_head", action="store_true", help="whether to quantize lm_head")
 parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
 parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.")
@@ -101,23 +103,29 @@ def initialize_model_and_tokenizer(model_name_or_path):
 
 model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path)
 device="hpu" if is_hpex_available() else "cuda"
+# in case that model is set to cuda:0 by default
+if args.device_map.isdigit() and device=="cuda":
+    device = f"{device}:{args.device_map}"
 
 if args.quantize:
-    autoround_dtype_mapping = {
-        "MXFP4": "mx_fp4",
-        "MXFP8": "mx_fp8",
-        "NVFP4": "nv_fp4",
-        "uNVFP4": "fp4_v2",
-        "NVFP4+": "fp4_v2",
-    }
-    args.dtype = autoround_dtype_mapping[args.dtype]
+    if args.dtype in ["uNVFP4", "NVFP4+"]:
+        from auto_round.schemes import QuantizationScheme
+
+        uNVFP4 = QuantizationScheme.from_dict(
+            {
+                "bits": 4,
+                "group_size": 16,
+                "data_type": "fp4_v2",
+                "act_bits": 4,
+                "act_data_type": "fp4_v2",
+                "act_group_size": 16,
+                "act_sym": True,
+            }
+        )
+        args.dtype = uNVFP4
+
     if args.quant_lm_head:
-        lm_head_config = {
-            "group_size": 32 if "mx" in args.dtype else 16,
-            "data_type": args.dtype,
-            "act_data_type": "fp4_v2_with_global_scale" if "fp4_v2" in args.dtype else args.dtype,
-        }
-        layer_config = {"lm_head": lm_head_config}
+        layer_config = {"lm_head": args.dtype}
 
     autoround = AutoRound(
         model,
@@ -128,10 +136,10 @@ def initialize_model_and_tokenizer(model_name_or_path):
         seqlen=args.seqlen,
         nsamples=args.nsamples,
         low_gpu_mem_usage=True,
-        group_size=32 if "mx" in args.dtype else 16,
-        data_type=args.dtype,
-        act_data_type="fp4_v2_with_global_scale" if "fp4_v2" in args.dtype else args.dtype,
+        scheme=args.dtype,
         layer_config=layer_config if args.quant_lm_head else None,
+        enable_torch_compile=args.enable_torch_compile,
+        mem_per_param_scale=args.mem_per_param_scale,
     )
 
     if args.use_recipe:
@@ -140,20 +148,16 @@ def load_recipe_results(file_path):
             import json
             with open(file_path, "r") as f:
                 return json.load(f)
-
+
         layer_config = load_recipe_results(args.recipe_file)
         if args.quant_lm_head:
-            mxfp8_config = {
-                "bits": 8,
-                "group_size": 32,
-                "data_type": "mx_fp8",
-                "act_data_type": "mx_fp8",
-            }
             # ensure lm_head is quantized with mxfp8_config
-            layer_config.update({"lm_head": mxfp8_config})
+            layer_config.update({"lm_head": "MXFP8"})
             print("In recipe mode, lm_head is quantized with MXFP8.")
         autoround.layer_config = layer_config
 
+        # A placeholder, to pass assertion in AutoRound
+        autoround.formats = "auto_round"
    autoround.quantize()
    model = autoround.model
 
@@ -192,7 +196,6 @@ def load_recipe_results(file_path):
     else:
         # CUDA evaluation support all tasks.
         # gsm8k requires add_bos_token=False for better accuracy for llama model.
-        # model = torch.compile(model)
         args.tasks = ["piqa", "hellaswag", "mmlu", "gsm8k"]
         all_accuracy = {}
         test_gsm8k = False
@@ -243,7 +246,7 @@ def load_recipe_results(file_path):
         print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}")
 
 if args.save:
-    if args.dtype == "nv_fp4":
+    if args.dtype == "NVFP4":
         # using llm_compressor format to save nv_fp4 model
         autoround.save_quantized(args.save_path, format=args.save_format)
     else:

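The quantize.py change above replaces the hand-rolled dtype and group-size plumbing with auto-round's scheme abstraction: a named scheme string or a `QuantizationScheme` object is passed straight to `AutoRound`, and per-layer overrides travel through `layer_config`. The following condensed sketch shows that flow under the assumption that the installed auto-round release supports the `QuantizationScheme.from_dict`, `scheme`, `enable_torch_compile`, and `mem_per_param_scale` APIs used in this commit; the opt-125m model is only a small stand-in.

```python
# Condensed, hypothetical sketch of the new quantization path in quantize.py.
# Assumes the installed auto-round release supports the arguments used in this
# commit (scheme, layer_config, enable_torch_compile, mem_per_param_scale).
from auto_round import AutoRound
from auto_round.schemes import QuantizationScheme
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "facebook/opt-125m"  # small stand-in model
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Either a named scheme string ("MXFP4", "MXFP8", "NVFP4") or a custom scheme,
# such as the uNVFP4 definition this commit builds around the fp4_v2 data type.
scheme = QuantizationScheme.from_dict(
    {
        "bits": 4,
        "group_size": 16,
        "data_type": "fp4_v2",
        "act_bits": 4,
        "act_data_type": "fp4_v2",
        "act_group_size": 16,
        "act_sym": True,
    }
)

autoround = AutoRound(
    model,
    tokenizer,
    scheme=scheme,                      # replaces the old group_size/data_type/act_data_type plumbing
    iters=200,
    seqlen=2048,
    nsamples=128,
    low_gpu_mem_usage=True,
    layer_config={"lm_head": "MXFP8"},  # per-layer override, e.g. keep lm_head at MXFP8
    enable_torch_compile=True,          # surfaced on the CLI as --enable_torch_compile
    mem_per_param_scale=13,             # surfaced on the CLI as --mem_per_param_scale
)
autoround.quantize()
quantized_model = autoround.model
```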