
Commit f5b3916

fix typos and resolve code review issues
1 parent f0f016e commit f5b3916

3 files changed: 43 additions and 24 deletions


backends/qualcomm/quantizer/qconfig.py

Lines changed: 23 additions & 9 deletions
@@ -137,18 +137,32 @@ def get_8a8w_qnn_ptq_config(
 
 
 def get_8a4w_qnn_ptq_config(
-    act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
+    act_symmetric: bool = True, act_observer=MovingAverageMinMaxObserver
 ) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-12}
 
-    act_quantization_spec = QuantizationSpec(
-        dtype=torch.uint8,
-        qscheme=(
-            torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
-        ),
-        ch_axis=0,
-        observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
-    )
+    if act_symmetric:
+        # If zero_point is 128, htp can do optimizations.
+        # If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
+        # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            qscheme=torch.per_tensor_symmetric,
+            ch_axis=0,
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )
+    else:
+        # PyTorch will remove redundant observers based on attributes such as:
+        # dtype, quant_min, quant_max, ch_axis, etc.
+        # Providing values like quant_min and quant_max can help observers compare
+        # and further reduce the number of observers.
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            quant_min=torch.iinfo(torch.uint8).min,
+            quant_max=torch.iinfo(torch.uint8).max,
+            qscheme=torch.per_tensor_affine,
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )
 
     weight_quantization_spec = QuantizationSpec(
         dtype=torch.int8,
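
The symmetric branch of this change leans on how PyTorch's stock observers pick a zero_point for unsigned 8-bit symmetric quantization, as the new comments describe. Below is a minimal sketch, not part of the commit, that reproduces the 128-vs-127 behaviour with MinMaxObserver; it assumes a recent PyTorch where observers accept plain torch.uint8 (older releases expect torch.quint8).

```python
import torch
from torch.ao.quantization.observer import MinMaxObserver

x = torch.randn(4, 8)

# Leaving quant_min/quant_max unset: symmetric uint8 defaults to zero_point 128,
# the value the HTP backend can optimize for.
default_range = MinMaxObserver(dtype=torch.uint8, qscheme=torch.per_tensor_symmetric)
default_range(x)
_, zp = default_range.calculate_qparams()
print(zp.item())  # expected: 128

# Passing the full uint8 range counts as a customized range, so the observer
# centers it at (0 + 255) // 2 == 127 instead -- the undesired case.
custom_range = MinMaxObserver(
    dtype=torch.uint8,
    qscheme=torch.per_tensor_symmetric,
    quant_min=torch.iinfo(torch.uint8).min,
    quant_max=torch.iinfo(torch.uint8).max,
)
custom_range(x)
_, zp = custom_range.calculate_qparams()
print(zp.item())  # expected: 127
```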

backends/qualcomm/quantizer/quant_recipe.py

Lines changed: 8 additions & 3 deletions
@@ -101,7 +101,7 @@ def get_quant_config(self, node: torch.fx.Node) -> Optional[QuantizationConfig]:
         op: OpOverload = node.target
 
         if not self._matches(node):
-            return
+            return None
 
         if self.granularity == QuantGranularity.PER_TENSOR:
             return self.quant_config.quant_config
@@ -121,6 +121,11 @@ def get_quant_config(self, node: torch.fx.Node) -> Optional[QuantizationConfig]:
             config = self.quant_config.per_block_quant_config_list[ch_axis]
             config.block_size = self.extra_kwargs["block_size"]
             return config
+        else:
+            raise ValueError(
+                f"Unsupported quantization granularity: {self.granularity}. "
+                f"Supported values: {[granularity.name for granularity in QuantGranularity]}"
+            )
 
 
 class ByNodeTarget(QuantizationStrategy):
@@ -364,12 +369,12 @@ def add_regex(
 
     def summary(self, max_rows: int = -1):
         if not self._pending_annotate_nodes:
-            return
+            return None
 
         headers = [
             "module_stack",
             "op_target",
-            "quatize",
+            "quantize",
             "act_observer",
             "granularity",
             "note",

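The new else branch makes the granularity dispatch in get_quant_config exhaustive: an unhandled QuantGranularity now fails loudly instead of silently falling through and returning None. A standalone sketch of that pattern follows; the enum and function names here are illustrative stand-ins, not the recipe's real API.

```python
from enum import Enum, auto


class Granularity(Enum):  # hypothetical stand-in for QuantGranularity
    PER_TENSOR = auto()
    PER_CHANNEL = auto()
    PER_BLOCK = auto()


def pick_config(granularity: Granularity) -> str:
    # Handle the granularities this toy recipe supports explicitly ...
    if granularity == Granularity.PER_TENSOR:
        return "per-tensor config"
    elif granularity == Granularity.PER_CHANNEL:
        return "per-channel config"
    # ... and raise for anything else, listing the supported members,
    # mirroring the ValueError added in the diff above.
    raise ValueError(
        f"Unsupported quantization granularity: {granularity}. "
        f"Supported values: {[g.name for g in Granularity]}"
    )


print(pick_config(Granularity.PER_TENSOR))  # per-tensor config
# pick_config(Granularity.PER_BLOCK) would raise:
# ValueError: Unsupported quantization granularity: Granularity.PER_BLOCK. ...
```
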
examples/qualcomm/oss_scripts/llama/static_llm_quant_recipe.py

Lines changed: 12 additions & 12 deletions
@@ -72,7 +72,7 @@ def __init__(self, verbose: bool = False):
                 granularity=QuantGranularity.PER_CHANNEL,
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -120,7 +120,7 @@ def __init__(self, verbose: bool = False):
                 granularity=QuantGranularity.PER_CHANNEL,
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -158,8 +158,8 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
-                    r"layers.[0-3].feed_forward.w2_conv",
+                    r"output\.conv",
+                    r"layers\.[0-3]\.feed_forward\.w2_conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -197,8 +197,8 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
-                    r"layers.2[1-7].feed_forward.w2_conv",
+                    r"output\.conv",
+                    r"layers\.2[1-7]\.feed_forward\.w2_conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -259,7 +259,7 @@ def __init__(self, verbose: bool = False):
             .add_regex(
                 {
                     r"layers\..*\.attention\.wv.*",
-                    r"output.conv",
+                    r"output\.conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -376,7 +376,7 @@ def __init__(self, verbose: bool = False):
                 granularity=QuantGranularity.PER_CHANNEL,
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -435,7 +435,7 @@ def __init__(self, verbose: bool = False):
                 extra_kwargs={"block_size": (1, 16, 1, 1)},
             )
             .add_regex(
-                {r"output.conv"},
+                {r"output\.conv"},
                 QuantDtype.use_16a8w,
                 False,
                 act_observer=MinMaxObserver,
@@ -470,7 +470,7 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"layers.*.feed_forward.w2_conv",
+                    r"layers\..*\.feed_forward\.w2_conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -506,7 +506,7 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
+                    r"output\.conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
@@ -578,7 +578,7 @@ def __init__(self, verbose: bool = False):
             )
             .add_regex(
                 {
-                    r"output.conv",
+                    r"output\.conv",
                 },
                 QuantDtype.use_16a8w,
                 False,
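
All of the fixes in this file escape the dots in module-name patterns. These strings are used as regular expressions, where an unescaped "." matches any single character, so r"output.conv" would also match unrelated names such as "output_conv". A quick check of the difference (the module names below are made up, and re.search stands in for however the recipe actually applies the patterns):

```python
import re

patterns = [r"output.conv", r"output\.conv"]
names = ["output.conv", "output_conv", "outputXconv"]

for pattern in patterns:
    matched = [name for name in names if re.search(pattern, name)]
    print(f"{pattern!r:>18} -> {matched}")

# r'output.conv'  matches all three names (the dot is a wildcard);
# r'output\.conv' matches only the literal 'output.conv'.
```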
