
Commit f5b3916

fix typos and resolve code review issues
1 parent f0f016e commit f5b3916

3 files changed: 43 additions and 24 deletions


backends/qualcomm/quantizer/qconfig.py

Lines changed: 23 additions & 9 deletions
@@ -137,18 +137,32 @@ def get_8a8w_qnn_ptq_config(
 
 
 def get_8a4w_qnn_ptq_config(
-    act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
+    act_symmetric: bool = True, act_observer=MovingAverageMinMaxObserver
 ) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-12}
 
-    act_quantization_spec = QuantizationSpec(
-        dtype=torch.uint8,
-        qscheme=(
-            torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
-        ),
-        ch_axis=0,
-        observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
-    )
+    if act_symmetric:
+        # If zero_point is 128, htp can do optimizations.
+        # If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
+        # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            qscheme=torch.per_tensor_symmetric,
+            ch_axis=0,
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )
+    else:
+        # PyTorch will remove redundant observers based on attributes such as:
+        # dtype, quant_min, quant_max, ch_axis, etc.
+        # Providing values like quant_min and quant_max can help observers compare
+        # and further reduce the number of observers.
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            quant_min=torch.iinfo(torch.uint8).min,
+            quant_max=torch.iinfo(torch.uint8).max,
+            qscheme=torch.per_tensor_affine,
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )
 
     weight_quantization_spec = QuantizationSpec(
         dtype=torch.int8,
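
The symmetric branch of this change leans on how PyTorch's stock observers pick a zero_point for unsigned 8-bit symmetric quantization, as the new comments describe. Below is a minimal sketch, not part of the commit, that reproduces the 128-vs-127 behaviour with MinMaxObserver; it assumes a recent PyTorch where observers accept plain torch.uint8 (older releases expect torch.quint8).

```python
import torch
from torch.ao.quantization.observer import MinMaxObserver

x = torch.randn(4, 8)

# Leaving quant_min/quant_max unset: symmetric uint8 defaults to zero_point 128,
# the value the HTP backend can optimize for.
default_range = MinMaxObserver(dtype=torch.uint8, qscheme=torch.per_tensor_symmetric)
default_range(x)
_, zp = default_range.calculate_qparams()
print(zp.item())  # expected: 128

# Passing the full uint8 range counts as a customized range, so the observer
# centers it at (0 + 255) // 2 == 127 instead -- the undesired case.
custom_range = MinMaxObserver(
    dtype=torch.uint8,
    qscheme=torch.per_tensor_symmetric,
    quant_min=torch.iinfo(torch.uint8).min,
    quant_max=torch.iinfo(torch.uint8).max,
)
custom_range(x)
_, zp = custom_range.calculate_qparams()
print(zp.item())  # expected: 127
```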

backends/qualcomm/quantizer/quant_recipe.py

Lines changed: 8 additions & 3 deletions
@@ -101,7 +101,7 @@ def get_quant_config(self, node: torch.fx.Node) -> Optional[QuantizationConfig]:
         op: OpOverload = node.target
 
         if not self._matches(node):
-            return
+            return None
 
         if self.granularity == QuantGranularity.PER_TENSOR:
             return self.quant_config.quant_config
@@ -121,6 +121,11 @@ def get_quant_config(self, node: torch.fx.Node) -> Optional[QuantizationConfig]:
             config = self.quant_config.per_block_quant_config_list[ch_axis]
             config.block_size = self.extra_kwargs["block_size"]
             return config
+        else:
+            raise ValueError(
+                f"Unsupported quantization granularity: {self.granularity}. "
+                f"Supported values: {[granularity.name for granularity in QuantGranularity]}"
+            )
 
 
 class ByNodeTarget(QuantizationStrategy):
@@ -364,12 +369,12 @@ def add_regex(
 
     def summary(self, max_rows: int = -1):
         if not self._pending_annotate_nodes:
-            return
+            return None
 
         headers = [
             "module_stack",
             "op_target",
-            "quatize",
+            "quantize",
             "act_observer",
             "granularity",
             "note",

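The new else branch makes the granularity dispatch in get_quant_config exhaustive: an unhandled QuantGranularity now fails loudly instead of silently falling through and returning None. A standalone sketch of that pattern follows; the enum and function names here are illustrative stand-ins, not the recipe's real API.

```python
from enum import Enum, auto


class Granularity(Enum):  # hypothetical stand-in for QuantGranularity
    PER_TENSOR = auto()
    PER_CHANNEL = auto()
    PER_BLOCK = auto()


def pick_config(granularity: Granularity) -> str:
    # Handle the granularities this toy recipe supports explicitly ...
    if granularity == Granularity.PER_TENSOR:
        return "per-tensor config"
    elif granularity == Granularity.PER_CHANNEL:
        return "per-channel config"
    # ... and raise for anything else, listing the supported members,
    # mirroring the ValueError added in the diff above.
    raise ValueError(
        f"Unsupported quantization granularity: {granularity}. "
        f"Supported values: {[g.name for g in Granularity]}"
    )


print(pick_config(Granularity.PER_TENSOR))  # per-tensor config
# pick_config(Granularity.PER_BLOCK) would raise:
# ValueError: Unsupported quantization granularity: Granularity.PER_BLOCK. ...
```
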
examples/qualcomm/oss_scripts/llama/static_llm_quant_recipe.py

Lines changed: 12 additions & 12 deletions
@@ -72,7 +72,7 @@ def __init__(self, verbose: bool = False):
                 granularity=QuantGranularity.PER_CHANNEL,
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -120,7 +120,7 @@ def __init__(self, verbose: bool = False):
                 granularity=QuantGranularity.PER_CHANNEL,
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -158,8 +158,8 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
-                    r"layers.[0-3].feed_forward.w2_conv",
+                    r"output\.conv",
+                    r"layers\.[0-3]\.feed_forward\.w2_conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -197,8 +197,8 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
-                    r"layers.2[1-7].feed_forward.w2_conv",
+                    r"output\.conv",
+                    r"layers\.2[1-7]\.feed_forward\.w2_conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -259,7 +259,7 @@ def __init__(self, verbose: bool = False):
             .add_regex(
                 {
                     r"layers\..*\.attention\.wv.*",
-                    r"output.conv",
+                    r"output\.conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -376,7 +376,7 @@ def __init__(self, verbose: bool = False):
                 granularity=QuantGranularity.PER_CHANNEL,
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -435,7 +435,7 @@ def __init__(self, verbose: bool = False):
                 extra_kwargs={"block_size": (1, 16, 1, 1)},
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -470,7 +470,7 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"layers.*.feed_forward.w2_conv",
+                    r"layers\..*\.feed_forward\.w2_conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -506,7 +506,7 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
+                    r"output\.conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -578,7 +578,7 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
+                    r"output\.conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
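
All of the fixes in this file escape the dots in module-name patterns. These strings are used as regular expressions, where an unescaped "." matches any single character, so r"output.conv" would also match unrelated names such as "output_conv". A quick check of the difference (the module names below are made up, and re.search stands in for however the recipe actually applies the patterns):

```python
import re

patterns = [r"output.conv", r"output\.conv"]
names = ["output.conv", "output_conv", "outputXconv"]

for pattern in patterns:
    matched = [name for name in names if re.search(pattern, name)]
    print(f"{pattern!r:>18} -> {matched}")

# r'output.conv'  matches all three names (the dot is a wildcard);
# r'output\.conv' matches only the literal 'output.conv'.
```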
